{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.41, "eval_steps": 3150, "global_step": 12915, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.349206349206349e-05, "grad_norm": 6.625, "learning_rate": 0.0006349206349206349, "loss": 10.801105499267578, "step": 2 }, { "epoch": 0.00012698412698412698, "grad_norm": 7.09375, "learning_rate": 0.0012698412698412698, "loss": 9.326807975769043, "step": 4 }, { "epoch": 0.00019047619047619048, "grad_norm": 3.890625, "learning_rate": 0.001904761904761905, "loss": 7.773923397064209, "step": 6 }, { "epoch": 0.00025396825396825396, "grad_norm": 6.96875, "learning_rate": 0.0025396825396825397, "loss": 7.479777812957764, "step": 8 }, { "epoch": 0.00031746031746031746, "grad_norm": 5.96875, "learning_rate": 0.0031746031746031746, "loss": 7.3737945556640625, "step": 10 }, { "epoch": 0.00038095238095238096, "grad_norm": 3.390625, "learning_rate": 0.00380952380952381, "loss": 7.2390546798706055, "step": 12 }, { "epoch": 0.00044444444444444447, "grad_norm": 3.125, "learning_rate": 0.0044444444444444444, "loss": 7.201446533203125, "step": 14 }, { "epoch": 0.0005079365079365079, "grad_norm": 3.046875, "learning_rate": 0.005079365079365079, "loss": 7.195759296417236, "step": 16 }, { "epoch": 0.0005714285714285715, "grad_norm": 2.703125, "learning_rate": 0.005714285714285714, "loss": 7.20925760269165, "step": 18 }, { "epoch": 0.0006349206349206349, "grad_norm": 3.875, "learning_rate": 0.006349206349206349, "loss": 7.200610160827637, "step": 20 }, { "epoch": 0.0006984126984126984, "grad_norm": 3.53125, "learning_rate": 0.006984126984126985, "loss": 7.15569543838501, "step": 22 }, { "epoch": 0.0007619047619047619, "grad_norm": 3.53125, "learning_rate": 0.00761904761904762, "loss": 7.171877384185791, "step": 24 }, { "epoch": 0.0008253968253968254, "grad_norm": 3.546875, "learning_rate": 0.008253968253968255, "loss": 7.162521839141846, "step": 26 }, { "epoch": 0.0008888888888888889, "grad_norm": 4.0625, "learning_rate": 0.008888888888888889, "loss": 7.177415370941162, "step": 28 }, { "epoch": 0.0009523809523809524, "grad_norm": 3.375, "learning_rate": 0.009523809523809525, "loss": 7.142434120178223, "step": 30 }, { "epoch": 0.0010158730158730158, "grad_norm": 4.03125, "learning_rate": 0.010158730158730159, "loss": 7.162986755371094, "step": 32 }, { "epoch": 0.0010793650793650793, "grad_norm": 5.53125, "learning_rate": 0.010793650793650795, "loss": 7.125688552856445, "step": 34 }, { "epoch": 0.001142857142857143, "grad_norm": 3.90625, "learning_rate": 0.011428571428571429, "loss": 7.064617156982422, "step": 36 }, { "epoch": 0.0012063492063492064, "grad_norm": 4.0, "learning_rate": 0.012063492063492064, "loss": 7.0018086433410645, "step": 38 }, { "epoch": 0.0012698412698412698, "grad_norm": 4.78125, "learning_rate": 0.012698412698412698, "loss": 6.9951910972595215, "step": 40 }, { "epoch": 0.0013333333333333333, "grad_norm": 4.34375, "learning_rate": 0.013333333333333334, "loss": 6.8632588386535645, "step": 42 }, { "epoch": 0.0013968253968253967, "grad_norm": 4.5625, "learning_rate": 0.01396825396825397, "loss": 6.8031721115112305, "step": 44 }, { "epoch": 0.0014603174603174604, "grad_norm": 5.90625, "learning_rate": 0.014603174603174604, "loss": 6.659753322601318, "step": 46 }, { "epoch": 0.0015238095238095239, "grad_norm": 5.3125, "learning_rate": 0.01523809523809524, "loss": 6.541396141052246, "step": 48 }, { "epoch": 0.0015873015873015873, "grad_norm": 7.375, "learning_rate": 0.015873015873015872, "loss": 6.320700645446777, "step": 50 }, { "epoch": 0.0016507936507936507, "grad_norm": 7.03125, "learning_rate": 0.01650793650793651, "loss": 6.180545806884766, "step": 52 }, { "epoch": 0.0017142857142857142, "grad_norm": 9.5, "learning_rate": 0.017142857142857144, "loss": 6.006712436676025, "step": 54 }, { "epoch": 0.0017777777777777779, "grad_norm": 5.5625, "learning_rate": 0.017777777777777778, "loss": 5.814790725708008, "step": 56 }, { "epoch": 0.0018412698412698413, "grad_norm": 7.71875, "learning_rate": 0.018412698412698415, "loss": 5.85807991027832, "step": 58 }, { "epoch": 0.0019047619047619048, "grad_norm": 11.0, "learning_rate": 0.01904761904761905, "loss": 5.783743858337402, "step": 60 }, { "epoch": 0.0019682539682539684, "grad_norm": 6.21875, "learning_rate": 0.019682539682539683, "loss": 5.803546905517578, "step": 62 }, { "epoch": 0.0020317460317460317, "grad_norm": 9.875, "learning_rate": 0.020317460317460317, "loss": 5.761871337890625, "step": 64 }, { "epoch": 0.0020952380952380953, "grad_norm": 9.0, "learning_rate": 0.020952380952380955, "loss": 5.712563991546631, "step": 66 }, { "epoch": 0.0021587301587301586, "grad_norm": 8.5, "learning_rate": 0.02158730158730159, "loss": 5.822197914123535, "step": 68 }, { "epoch": 0.0022222222222222222, "grad_norm": 5.3125, "learning_rate": 0.022222222222222223, "loss": 5.670584678649902, "step": 70 }, { "epoch": 0.002285714285714286, "grad_norm": 6.90625, "learning_rate": 0.022857142857142857, "loss": 5.861469745635986, "step": 72 }, { "epoch": 0.002349206349206349, "grad_norm": 7.90625, "learning_rate": 0.023492063492063495, "loss": 5.852883815765381, "step": 74 }, { "epoch": 0.0024126984126984128, "grad_norm": 8.25, "learning_rate": 0.02412698412698413, "loss": 6.089724540710449, "step": 76 }, { "epoch": 0.002476190476190476, "grad_norm": 11.75, "learning_rate": 0.024761904761904763, "loss": 5.999917030334473, "step": 78 }, { "epoch": 0.0025396825396825397, "grad_norm": 11.4375, "learning_rate": 0.025396825396825397, "loss": 6.174747467041016, "step": 80 }, { "epoch": 0.0026031746031746033, "grad_norm": 8.3125, "learning_rate": 0.026031746031746034, "loss": 6.123311519622803, "step": 82 }, { "epoch": 0.0026666666666666666, "grad_norm": 12.1875, "learning_rate": 0.02666666666666667, "loss": 6.0668158531188965, "step": 84 }, { "epoch": 0.0027301587301587302, "grad_norm": 6.75, "learning_rate": 0.0273015873015873, "loss": 6.131885051727295, "step": 86 }, { "epoch": 0.0027936507936507935, "grad_norm": 6.65625, "learning_rate": 0.02793650793650794, "loss": 6.160358428955078, "step": 88 }, { "epoch": 0.002857142857142857, "grad_norm": 9.3125, "learning_rate": 0.02857142857142857, "loss": 6.3066277503967285, "step": 90 }, { "epoch": 0.002920634920634921, "grad_norm": 5.4375, "learning_rate": 0.029206349206349208, "loss": 6.316789627075195, "step": 92 }, { "epoch": 0.002984126984126984, "grad_norm": 6.59375, "learning_rate": 0.029841269841269842, "loss": 6.33724308013916, "step": 94 }, { "epoch": 0.0030476190476190477, "grad_norm": 6.5, "learning_rate": 0.03047619047619048, "loss": 6.288860321044922, "step": 96 }, { "epoch": 0.003111111111111111, "grad_norm": 8.6875, "learning_rate": 0.031111111111111114, "loss": 6.362759113311768, "step": 98 }, { "epoch": 0.0031746031746031746, "grad_norm": 5.1875, "learning_rate": 0.031746031746031744, "loss": 6.259101390838623, "step": 100 }, { "epoch": 0.0032380952380952383, "grad_norm": 4.0, "learning_rate": 0.032380952380952385, "loss": 6.47907829284668, "step": 102 }, { "epoch": 0.0033015873015873015, "grad_norm": 5.28125, "learning_rate": 0.03301587301587302, "loss": 6.57755184173584, "step": 104 }, { "epoch": 0.003365079365079365, "grad_norm": 5.59375, "learning_rate": 0.03365079365079365, "loss": 6.404579162597656, "step": 106 }, { "epoch": 0.0034285714285714284, "grad_norm": 5.40625, "learning_rate": 0.03428571428571429, "loss": 6.36335563659668, "step": 108 }, { "epoch": 0.003492063492063492, "grad_norm": 5.8125, "learning_rate": 0.03492063492063492, "loss": 6.326794147491455, "step": 110 }, { "epoch": 0.0035555555555555557, "grad_norm": 5.34375, "learning_rate": 0.035555555555555556, "loss": 6.243236541748047, "step": 112 }, { "epoch": 0.003619047619047619, "grad_norm": 4.71875, "learning_rate": 0.03619047619047619, "loss": 6.32989501953125, "step": 114 }, { "epoch": 0.0036825396825396826, "grad_norm": 5.21875, "learning_rate": 0.03682539682539683, "loss": 6.213305950164795, "step": 116 }, { "epoch": 0.003746031746031746, "grad_norm": 4.6875, "learning_rate": 0.037460317460317465, "loss": 6.210652828216553, "step": 118 }, { "epoch": 0.0038095238095238095, "grad_norm": 3.890625, "learning_rate": 0.0380952380952381, "loss": 5.987586975097656, "step": 120 }, { "epoch": 0.003873015873015873, "grad_norm": 4.1875, "learning_rate": 0.03873015873015873, "loss": 5.850035190582275, "step": 122 }, { "epoch": 0.003936507936507937, "grad_norm": 3.609375, "learning_rate": 0.03936507936507937, "loss": 5.705539703369141, "step": 124 }, { "epoch": 0.004, "grad_norm": 3.984375, "learning_rate": 0.04000000000000001, "loss": 5.706523418426514, "step": 126 }, { "epoch": 0.004063492063492063, "grad_norm": 4.15625, "learning_rate": 0.040634920634920635, "loss": 5.558238983154297, "step": 128 }, { "epoch": 0.004126984126984127, "grad_norm": 3.96875, "learning_rate": 0.04126984126984127, "loss": 5.4358367919921875, "step": 130 }, { "epoch": 0.004190476190476191, "grad_norm": 3.015625, "learning_rate": 0.04190476190476191, "loss": 5.235808372497559, "step": 132 }, { "epoch": 0.004253968253968254, "grad_norm": 3.40625, "learning_rate": 0.042539682539682544, "loss": 5.232395172119141, "step": 134 }, { "epoch": 0.004317460317460317, "grad_norm": 3.78125, "learning_rate": 0.04317460317460318, "loss": 5.167571067810059, "step": 136 }, { "epoch": 0.004380952380952381, "grad_norm": 6.21875, "learning_rate": 0.04380952380952381, "loss": 5.139188289642334, "step": 138 }, { "epoch": 0.0044444444444444444, "grad_norm": 4.28125, "learning_rate": 0.044444444444444446, "loss": 5.103665351867676, "step": 140 }, { "epoch": 0.004507936507936508, "grad_norm": 2.9375, "learning_rate": 0.04507936507936508, "loss": 5.074988842010498, "step": 142 }, { "epoch": 0.004571428571428572, "grad_norm": 2.59375, "learning_rate": 0.045714285714285714, "loss": 4.910512924194336, "step": 144 }, { "epoch": 0.004634920634920635, "grad_norm": 2.96875, "learning_rate": 0.046349206349206355, "loss": 4.994100093841553, "step": 146 }, { "epoch": 0.004698412698412698, "grad_norm": 2.71875, "learning_rate": 0.04698412698412699, "loss": 4.926704406738281, "step": 148 }, { "epoch": 0.004761904761904762, "grad_norm": 2.90625, "learning_rate": 0.047619047619047616, "loss": 4.945565223693848, "step": 150 }, { "epoch": 0.0048253968253968256, "grad_norm": 3.140625, "learning_rate": 0.04825396825396826, "loss": 4.820583343505859, "step": 152 }, { "epoch": 0.004888888888888889, "grad_norm": 2.375, "learning_rate": 0.04888888888888889, "loss": 4.817602634429932, "step": 154 }, { "epoch": 0.004952380952380952, "grad_norm": 2.890625, "learning_rate": 0.049523809523809526, "loss": 4.858508110046387, "step": 156 }, { "epoch": 0.005015873015873016, "grad_norm": 3.09375, "learning_rate": 0.05015873015873016, "loss": 4.83630895614624, "step": 158 }, { "epoch": 0.005079365079365079, "grad_norm": 3.171875, "learning_rate": 0.050793650793650794, "loss": 4.845445156097412, "step": 160 }, { "epoch": 0.005142857142857143, "grad_norm": 2.671875, "learning_rate": 0.05142857142857143, "loss": 4.703234672546387, "step": 162 }, { "epoch": 0.005206349206349207, "grad_norm": 2.21875, "learning_rate": 0.05206349206349207, "loss": 4.716393947601318, "step": 164 }, { "epoch": 0.00526984126984127, "grad_norm": 1.9921875, "learning_rate": 0.0526984126984127, "loss": 4.729320049285889, "step": 166 }, { "epoch": 0.005333333333333333, "grad_norm": 2.296875, "learning_rate": 0.05333333333333334, "loss": 4.728988170623779, "step": 168 }, { "epoch": 0.005396825396825397, "grad_norm": 2.015625, "learning_rate": 0.05396825396825397, "loss": 4.712806701660156, "step": 170 }, { "epoch": 0.0054603174603174605, "grad_norm": 2.1875, "learning_rate": 0.0546031746031746, "loss": 4.607054710388184, "step": 172 }, { "epoch": 0.005523809523809524, "grad_norm": 1.90625, "learning_rate": 0.055238095238095246, "loss": 4.64877462387085, "step": 174 }, { "epoch": 0.005587301587301587, "grad_norm": 1.8515625, "learning_rate": 0.05587301587301588, "loss": 4.659054279327393, "step": 176 }, { "epoch": 0.005650793650793651, "grad_norm": 2.046875, "learning_rate": 0.05650793650793651, "loss": 4.607472896575928, "step": 178 }, { "epoch": 0.005714285714285714, "grad_norm": 3.015625, "learning_rate": 0.05714285714285714, "loss": 4.644662380218506, "step": 180 }, { "epoch": 0.0057777777777777775, "grad_norm": 2.125, "learning_rate": 0.057777777777777775, "loss": 4.561500549316406, "step": 182 }, { "epoch": 0.005841269841269842, "grad_norm": 1.9296875, "learning_rate": 0.058412698412698416, "loss": 4.61305570602417, "step": 184 }, { "epoch": 0.005904761904761905, "grad_norm": 2.0, "learning_rate": 0.05904761904761905, "loss": 4.505429744720459, "step": 186 }, { "epoch": 0.005968253968253968, "grad_norm": 1.796875, "learning_rate": 0.059682539682539684, "loss": 4.505371570587158, "step": 188 }, { "epoch": 0.006031746031746032, "grad_norm": 1.5859375, "learning_rate": 0.06031746031746032, "loss": 4.484126091003418, "step": 190 }, { "epoch": 0.006095238095238095, "grad_norm": 1.796875, "learning_rate": 0.06095238095238096, "loss": 4.488917350769043, "step": 192 }, { "epoch": 0.006158730158730159, "grad_norm": 2.0625, "learning_rate": 0.06158730158730159, "loss": 4.548356533050537, "step": 194 }, { "epoch": 0.006222222222222222, "grad_norm": 1.7578125, "learning_rate": 0.06222222222222223, "loss": 4.44257926940918, "step": 196 }, { "epoch": 0.006285714285714286, "grad_norm": 1.7890625, "learning_rate": 0.06285714285714286, "loss": 4.4452056884765625, "step": 198 }, { "epoch": 0.006349206349206349, "grad_norm": 1.8515625, "learning_rate": 0.06349206349206349, "loss": 4.5187506675720215, "step": 200 }, { "epoch": 0.006412698412698412, "grad_norm": 1.7265625, "learning_rate": 0.06412698412698413, "loss": 4.399511814117432, "step": 202 }, { "epoch": 0.0064761904761904765, "grad_norm": 1.703125, "learning_rate": 0.06476190476190477, "loss": 4.358022212982178, "step": 204 }, { "epoch": 0.00653968253968254, "grad_norm": 1.671875, "learning_rate": 0.0653968253968254, "loss": 4.394007205963135, "step": 206 }, { "epoch": 0.006603174603174603, "grad_norm": 1.59375, "learning_rate": 0.06603174603174604, "loss": 4.378864765167236, "step": 208 }, { "epoch": 0.006666666666666667, "grad_norm": 1.578125, "learning_rate": 0.06666666666666667, "loss": 4.373222827911377, "step": 210 }, { "epoch": 0.00673015873015873, "grad_norm": 1.390625, "learning_rate": 0.0673015873015873, "loss": 4.297969341278076, "step": 212 }, { "epoch": 0.0067936507936507936, "grad_norm": 1.578125, "learning_rate": 0.06793650793650795, "loss": 4.358753204345703, "step": 214 }, { "epoch": 0.006857142857142857, "grad_norm": 2.125, "learning_rate": 0.06857142857142857, "loss": 4.382944107055664, "step": 216 }, { "epoch": 0.006920634920634921, "grad_norm": 1.7421875, "learning_rate": 0.0692063492063492, "loss": 4.358743667602539, "step": 218 }, { "epoch": 0.006984126984126984, "grad_norm": 1.9375, "learning_rate": 0.06984126984126984, "loss": 4.28198766708374, "step": 220 }, { "epoch": 0.007047619047619047, "grad_norm": 1.90625, "learning_rate": 0.07047619047619048, "loss": 4.325778007507324, "step": 222 }, { "epoch": 0.0071111111111111115, "grad_norm": 1.5703125, "learning_rate": 0.07111111111111111, "loss": 4.283350467681885, "step": 224 }, { "epoch": 0.007174603174603175, "grad_norm": 1.875, "learning_rate": 0.07174603174603175, "loss": 4.292675971984863, "step": 226 }, { "epoch": 0.007238095238095238, "grad_norm": 1.5078125, "learning_rate": 0.07238095238095238, "loss": 4.210683345794678, "step": 228 }, { "epoch": 0.007301587301587302, "grad_norm": 1.4296875, "learning_rate": 0.07301587301587302, "loss": 4.2952704429626465, "step": 230 }, { "epoch": 0.007365079365079365, "grad_norm": 1.75, "learning_rate": 0.07365079365079366, "loss": 4.2953972816467285, "step": 232 }, { "epoch": 0.0074285714285714285, "grad_norm": 1.6484375, "learning_rate": 0.07428571428571429, "loss": 4.21522331237793, "step": 234 }, { "epoch": 0.007492063492063492, "grad_norm": 1.5703125, "learning_rate": 0.07492063492063493, "loss": 4.231412410736084, "step": 236 }, { "epoch": 0.007555555555555556, "grad_norm": 1.6484375, "learning_rate": 0.07555555555555556, "loss": 4.215019226074219, "step": 238 }, { "epoch": 0.007619047619047619, "grad_norm": 1.703125, "learning_rate": 0.0761904761904762, "loss": 4.229800701141357, "step": 240 }, { "epoch": 0.007682539682539682, "grad_norm": 1.2109375, "learning_rate": 0.07682539682539684, "loss": 4.152937412261963, "step": 242 }, { "epoch": 0.007746031746031746, "grad_norm": 1.3203125, "learning_rate": 0.07746031746031747, "loss": 4.1424665451049805, "step": 244 }, { "epoch": 0.00780952380952381, "grad_norm": 1.5234375, "learning_rate": 0.0780952380952381, "loss": 4.163131237030029, "step": 246 }, { "epoch": 0.007873015873015874, "grad_norm": 1.5078125, "learning_rate": 0.07873015873015873, "loss": 4.1024556159973145, "step": 248 }, { "epoch": 0.007936507936507936, "grad_norm": 1.2734375, "learning_rate": 0.07936507936507936, "loss": 4.060139179229736, "step": 250 }, { "epoch": 0.008, "grad_norm": 1.265625, "learning_rate": 0.08000000000000002, "loss": 4.058903217315674, "step": 252 }, { "epoch": 0.008063492063492064, "grad_norm": 1.2421875, "learning_rate": 0.08063492063492064, "loss": 4.025557994842529, "step": 254 }, { "epoch": 0.008126984126984127, "grad_norm": 1.375, "learning_rate": 0.08126984126984127, "loss": 3.9608795642852783, "step": 256 }, { "epoch": 0.00819047619047619, "grad_norm": 1.6875, "learning_rate": 0.08190476190476191, "loss": 3.9889020919799805, "step": 258 }, { "epoch": 0.008253968253968255, "grad_norm": 1.6171875, "learning_rate": 0.08253968253968254, "loss": 3.962678909301758, "step": 260 }, { "epoch": 0.008317460317460317, "grad_norm": 1.4375, "learning_rate": 0.08317460317460318, "loss": 3.89566707611084, "step": 262 }, { "epoch": 0.008380952380952381, "grad_norm": 1.3203125, "learning_rate": 0.08380952380952382, "loss": 3.938316583633423, "step": 264 }, { "epoch": 0.008444444444444444, "grad_norm": 1.0859375, "learning_rate": 0.08444444444444445, "loss": 3.9036355018615723, "step": 266 }, { "epoch": 0.008507936507936508, "grad_norm": 1.4609375, "learning_rate": 0.08507936507936509, "loss": 3.845884084701538, "step": 268 }, { "epoch": 0.008571428571428572, "grad_norm": 1.078125, "learning_rate": 0.08571428571428572, "loss": 3.865518569946289, "step": 270 }, { "epoch": 0.008634920634920634, "grad_norm": 1.265625, "learning_rate": 0.08634920634920636, "loss": 3.879326105117798, "step": 272 }, { "epoch": 0.008698412698412698, "grad_norm": 1.4140625, "learning_rate": 0.086984126984127, "loss": 3.819279193878174, "step": 274 }, { "epoch": 0.008761904761904762, "grad_norm": 1.7734375, "learning_rate": 0.08761904761904762, "loss": 3.8500442504882812, "step": 276 }, { "epoch": 0.008825396825396825, "grad_norm": 1.6484375, "learning_rate": 0.08825396825396825, "loss": 3.7705399990081787, "step": 278 }, { "epoch": 0.008888888888888889, "grad_norm": 1.125, "learning_rate": 0.08888888888888889, "loss": 3.777470111846924, "step": 280 }, { "epoch": 0.008952380952380953, "grad_norm": 1.03125, "learning_rate": 0.08952380952380953, "loss": 3.7298450469970703, "step": 282 }, { "epoch": 0.009015873015873015, "grad_norm": 1.046875, "learning_rate": 0.09015873015873016, "loss": 3.794740915298462, "step": 284 }, { "epoch": 0.00907936507936508, "grad_norm": 0.97265625, "learning_rate": 0.0907936507936508, "loss": 3.737441062927246, "step": 286 }, { "epoch": 0.009142857142857144, "grad_norm": 1.1640625, "learning_rate": 0.09142857142857143, "loss": 3.7325448989868164, "step": 288 }, { "epoch": 0.009206349206349206, "grad_norm": 1.265625, "learning_rate": 0.09206349206349207, "loss": 3.7607827186584473, "step": 290 }, { "epoch": 0.00926984126984127, "grad_norm": 1.0546875, "learning_rate": 0.09269841269841271, "loss": 3.6826963424682617, "step": 292 }, { "epoch": 0.009333333333333334, "grad_norm": 1.3515625, "learning_rate": 0.09333333333333334, "loss": 3.7264299392700195, "step": 294 }, { "epoch": 0.009396825396825396, "grad_norm": 0.9765625, "learning_rate": 0.09396825396825398, "loss": 3.7098519802093506, "step": 296 }, { "epoch": 0.00946031746031746, "grad_norm": 0.90234375, "learning_rate": 0.0946031746031746, "loss": 3.676978826522827, "step": 298 }, { "epoch": 0.009523809523809525, "grad_norm": 0.81640625, "learning_rate": 0.09523809523809523, "loss": 3.687487840652466, "step": 300 }, { "epoch": 0.009587301587301587, "grad_norm": 1.1484375, "learning_rate": 0.09587301587301589, "loss": 3.673914670944214, "step": 302 }, { "epoch": 0.009650793650793651, "grad_norm": 1.0625, "learning_rate": 0.09650793650793651, "loss": 3.670208215713501, "step": 304 }, { "epoch": 0.009714285714285713, "grad_norm": 1.0859375, "learning_rate": 0.09714285714285714, "loss": 3.633423328399658, "step": 306 }, { "epoch": 0.009777777777777778, "grad_norm": 0.73046875, "learning_rate": 0.09777777777777778, "loss": 3.66752552986145, "step": 308 }, { "epoch": 0.009841269841269842, "grad_norm": 0.7578125, "learning_rate": 0.09841269841269841, "loss": 3.6017661094665527, "step": 310 }, { "epoch": 0.009904761904761904, "grad_norm": 0.80078125, "learning_rate": 0.09904761904761905, "loss": 3.6119208335876465, "step": 312 }, { "epoch": 0.009968253968253968, "grad_norm": 1.359375, "learning_rate": 0.09968253968253969, "loss": 3.6598455905914307, "step": 314 }, { "epoch": 0.010031746031746032, "grad_norm": 1.671875, "learning_rate": 0.1, "loss": 3.5938162803649902, "step": 316 }, { "epoch": 0.010095238095238095, "grad_norm": 1.59375, "learning_rate": 0.1, "loss": 3.562201499938965, "step": 318 }, { "epoch": 0.010158730158730159, "grad_norm": 0.953125, "learning_rate": 0.1, "loss": 3.568826913833618, "step": 320 }, { "epoch": 0.010222222222222223, "grad_norm": 0.97265625, "learning_rate": 0.1, "loss": 3.5671839714050293, "step": 322 }, { "epoch": 0.010285714285714285, "grad_norm": 1.3046875, "learning_rate": 0.1, "loss": 3.5616164207458496, "step": 324 }, { "epoch": 0.01034920634920635, "grad_norm": 1.4140625, "learning_rate": 0.1, "loss": 3.4934566020965576, "step": 326 }, { "epoch": 0.010412698412698413, "grad_norm": 0.8515625, "learning_rate": 0.1, "loss": 3.53564715385437, "step": 328 }, { "epoch": 0.010476190476190476, "grad_norm": 0.77734375, "learning_rate": 0.1, "loss": 3.4729278087615967, "step": 330 }, { "epoch": 0.01053968253968254, "grad_norm": 1.2578125, "learning_rate": 0.1, "loss": 3.511234998703003, "step": 332 }, { "epoch": 0.010603174603174604, "grad_norm": 1.203125, "learning_rate": 0.1, "loss": 3.5119216442108154, "step": 334 }, { "epoch": 0.010666666666666666, "grad_norm": 0.796875, "learning_rate": 0.1, "loss": 3.4519412517547607, "step": 336 }, { "epoch": 0.01073015873015873, "grad_norm": 0.56640625, "learning_rate": 0.1, "loss": 3.4901010990142822, "step": 338 }, { "epoch": 0.010793650793650795, "grad_norm": 0.66796875, "learning_rate": 0.1, "loss": 3.4803192615509033, "step": 340 }, { "epoch": 0.010857142857142857, "grad_norm": 0.47265625, "learning_rate": 0.1, "loss": 3.473706007003784, "step": 342 }, { "epoch": 0.010920634920634921, "grad_norm": 0.5625, "learning_rate": 0.1, "loss": 3.4645204544067383, "step": 344 }, { "epoch": 0.010984126984126983, "grad_norm": 0.9140625, "learning_rate": 0.1, "loss": 3.453927993774414, "step": 346 }, { "epoch": 0.011047619047619047, "grad_norm": 0.84765625, "learning_rate": 0.1, "loss": 3.47428035736084, "step": 348 }, { "epoch": 0.011111111111111112, "grad_norm": 0.90625, "learning_rate": 0.1, "loss": 3.4447450637817383, "step": 350 }, { "epoch": 0.011174603174603174, "grad_norm": 1.0, "learning_rate": 0.1, "loss": 3.4368631839752197, "step": 352 }, { "epoch": 0.011238095238095238, "grad_norm": 0.671875, "learning_rate": 0.1, "loss": 3.45393705368042, "step": 354 }, { "epoch": 0.011301587301587302, "grad_norm": 0.431640625, "learning_rate": 0.1, "loss": 3.400446653366089, "step": 356 }, { "epoch": 0.011365079365079364, "grad_norm": 0.474609375, "learning_rate": 0.1, "loss": 3.399395704269409, "step": 358 }, { "epoch": 0.011428571428571429, "grad_norm": 0.72265625, "learning_rate": 0.1, "loss": 3.3908536434173584, "step": 360 }, { "epoch": 0.011492063492063493, "grad_norm": 1.4296875, "learning_rate": 0.1, "loss": 3.406240224838257, "step": 362 }, { "epoch": 0.011555555555555555, "grad_norm": 1.5390625, "learning_rate": 0.1, "loss": 3.3849599361419678, "step": 364 }, { "epoch": 0.011619047619047619, "grad_norm": 1.2890625, "learning_rate": 0.1, "loss": 3.3550808429718018, "step": 366 }, { "epoch": 0.011682539682539683, "grad_norm": 0.86328125, "learning_rate": 0.1, "loss": 3.330925226211548, "step": 368 }, { "epoch": 0.011746031746031746, "grad_norm": 0.63671875, "learning_rate": 0.1, "loss": 3.3373827934265137, "step": 370 }, { "epoch": 0.01180952380952381, "grad_norm": 0.3828125, "learning_rate": 0.1, "loss": 3.358631134033203, "step": 372 }, { "epoch": 0.011873015873015874, "grad_norm": 0.71484375, "learning_rate": 0.1, "loss": 3.3574931621551514, "step": 374 }, { "epoch": 0.011936507936507936, "grad_norm": 0.99609375, "learning_rate": 0.1, "loss": 3.3619513511657715, "step": 376 }, { "epoch": 0.012, "grad_norm": 0.9765625, "learning_rate": 0.1, "loss": 3.350740432739258, "step": 378 }, { "epoch": 0.012063492063492064, "grad_norm": 0.95703125, "learning_rate": 0.1, "loss": 3.328664541244507, "step": 380 }, { "epoch": 0.012126984126984127, "grad_norm": 0.70703125, "learning_rate": 0.1, "loss": 3.3298966884613037, "step": 382 }, { "epoch": 0.01219047619047619, "grad_norm": 0.65234375, "learning_rate": 0.1, "loss": 3.3007969856262207, "step": 384 }, { "epoch": 0.012253968253968253, "grad_norm": 0.443359375, "learning_rate": 0.1, "loss": 3.3021342754364014, "step": 386 }, { "epoch": 0.012317460317460317, "grad_norm": 0.44921875, "learning_rate": 0.1, "loss": 3.262967109680176, "step": 388 }, { "epoch": 0.012380952380952381, "grad_norm": 0.984375, "learning_rate": 0.1, "loss": 3.287736177444458, "step": 390 }, { "epoch": 0.012444444444444444, "grad_norm": 1.171875, "learning_rate": 0.1, "loss": 3.303879976272583, "step": 392 }, { "epoch": 0.012507936507936508, "grad_norm": 1.390625, "learning_rate": 0.1, "loss": 3.289409637451172, "step": 394 }, { "epoch": 0.012571428571428572, "grad_norm": 1.2890625, "learning_rate": 0.1, "loss": 3.262639284133911, "step": 396 }, { "epoch": 0.012634920634920634, "grad_norm": 0.89453125, "learning_rate": 0.1, "loss": 3.289180278778076, "step": 398 }, { "epoch": 0.012698412698412698, "grad_norm": 0.640625, "learning_rate": 0.1, "loss": 3.259063482284546, "step": 400 }, { "epoch": 0.012761904761904763, "grad_norm": 0.6875, "learning_rate": 0.1, "loss": 3.2444591522216797, "step": 402 }, { "epoch": 0.012825396825396825, "grad_norm": 1.1640625, "learning_rate": 0.1, "loss": 3.232419490814209, "step": 404 }, { "epoch": 0.012888888888888889, "grad_norm": 1.328125, "learning_rate": 0.1, "loss": 3.248533010482788, "step": 406 }, { "epoch": 0.012952380952380953, "grad_norm": 1.0234375, "learning_rate": 0.1, "loss": 3.2404844760894775, "step": 408 }, { "epoch": 0.013015873015873015, "grad_norm": 0.609375, "learning_rate": 0.1, "loss": 3.225863218307495, "step": 410 }, { "epoch": 0.01307936507936508, "grad_norm": 0.4765625, "learning_rate": 0.1, "loss": 3.230936288833618, "step": 412 }, { "epoch": 0.013142857142857144, "grad_norm": 0.5078125, "learning_rate": 0.1, "loss": 3.214132070541382, "step": 414 }, { "epoch": 0.013206349206349206, "grad_norm": 0.76171875, "learning_rate": 0.1, "loss": 3.205977201461792, "step": 416 }, { "epoch": 0.01326984126984127, "grad_norm": 1.0859375, "learning_rate": 0.1, "loss": 3.221465826034546, "step": 418 }, { "epoch": 0.013333333333333334, "grad_norm": 1.4375, "learning_rate": 0.1, "loss": 3.2179253101348877, "step": 420 }, { "epoch": 0.013396825396825397, "grad_norm": 1.1484375, "learning_rate": 0.1, "loss": 3.191230535507202, "step": 422 }, { "epoch": 0.01346031746031746, "grad_norm": 0.8515625, "learning_rate": 0.1, "loss": 3.1901586055755615, "step": 424 }, { "epoch": 0.013523809523809523, "grad_norm": 0.24609375, "learning_rate": 0.1, "loss": 3.1957831382751465, "step": 426 }, { "epoch": 0.013587301587301587, "grad_norm": 0.76953125, "learning_rate": 0.1, "loss": 3.1834909915924072, "step": 428 }, { "epoch": 0.013650793650793651, "grad_norm": 0.78125, "learning_rate": 0.1, "loss": 3.1901142597198486, "step": 430 }, { "epoch": 0.013714285714285714, "grad_norm": 0.453125, "learning_rate": 0.1, "loss": 3.1972007751464844, "step": 432 }, { "epoch": 0.013777777777777778, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 3.166499137878418, "step": 434 }, { "epoch": 0.013841269841269842, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 3.172959089279175, "step": 436 }, { "epoch": 0.013904761904761904, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 3.1746485233306885, "step": 438 }, { "epoch": 0.013968253968253968, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 3.1343274116516113, "step": 440 }, { "epoch": 0.014031746031746032, "grad_norm": 0.376953125, "learning_rate": 0.1, "loss": 3.1571145057678223, "step": 442 }, { "epoch": 0.014095238095238095, "grad_norm": 0.80859375, "learning_rate": 0.1, "loss": 3.1749491691589355, "step": 444 }, { "epoch": 0.014158730158730159, "grad_norm": 1.21875, "learning_rate": 0.1, "loss": 3.162214517593384, "step": 446 }, { "epoch": 0.014222222222222223, "grad_norm": 1.0859375, "learning_rate": 0.1, "loss": 3.1633312702178955, "step": 448 }, { "epoch": 0.014285714285714285, "grad_norm": 0.482421875, "learning_rate": 0.1, "loss": 3.165236711502075, "step": 450 }, { "epoch": 0.01434920634920635, "grad_norm": 0.236328125, "learning_rate": 0.1, "loss": 3.1206204891204834, "step": 452 }, { "epoch": 0.014412698412698413, "grad_norm": 0.1943359375, "learning_rate": 0.1, "loss": 3.137232780456543, "step": 454 }, { "epoch": 0.014476190476190476, "grad_norm": 0.61328125, "learning_rate": 0.1, "loss": 3.1554222106933594, "step": 456 }, { "epoch": 0.01453968253968254, "grad_norm": 1.6484375, "learning_rate": 0.1, "loss": 3.1488876342773438, "step": 458 }, { "epoch": 0.014603174603174604, "grad_norm": 2.03125, "learning_rate": 0.1, "loss": 3.1315207481384277, "step": 460 }, { "epoch": 0.014666666666666666, "grad_norm": 2.28125, "learning_rate": 0.1, "loss": 3.1631195545196533, "step": 462 }, { "epoch": 0.01473015873015873, "grad_norm": 1.71875, "learning_rate": 0.1, "loss": 3.107161283493042, "step": 464 }, { "epoch": 0.014793650793650793, "grad_norm": 0.96875, "learning_rate": 0.1, "loss": 3.146433115005493, "step": 466 }, { "epoch": 0.014857142857142857, "grad_norm": 0.1953125, "learning_rate": 0.1, "loss": 3.1239233016967773, "step": 468 }, { "epoch": 0.014920634920634921, "grad_norm": 0.490234375, "learning_rate": 0.1, "loss": 3.124284267425537, "step": 470 }, { "epoch": 0.014984126984126983, "grad_norm": 0.92578125, "learning_rate": 0.1, "loss": 3.1383025646209717, "step": 472 }, { "epoch": 0.015047619047619048, "grad_norm": 0.94140625, "learning_rate": 0.1, "loss": 3.1105380058288574, "step": 474 }, { "epoch": 0.015111111111111112, "grad_norm": 0.92578125, "learning_rate": 0.1, "loss": 3.112609624862671, "step": 476 }, { "epoch": 0.015174603174603174, "grad_norm": 1.015625, "learning_rate": 0.1, "loss": 3.0979175567626953, "step": 478 }, { "epoch": 0.015238095238095238, "grad_norm": 0.828125, "learning_rate": 0.1, "loss": 3.1076698303222656, "step": 480 }, { "epoch": 0.015301587301587302, "grad_norm": 0.8203125, "learning_rate": 0.1, "loss": 3.119893789291382, "step": 482 }, { "epoch": 0.015365079365079365, "grad_norm": 0.3984375, "learning_rate": 0.1, "loss": 3.0772712230682373, "step": 484 }, { "epoch": 0.015428571428571429, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 3.084977388381958, "step": 486 }, { "epoch": 0.015492063492063493, "grad_norm": 0.2353515625, "learning_rate": 0.1, "loss": 3.1142385005950928, "step": 488 }, { "epoch": 0.015555555555555555, "grad_norm": 0.52734375, "learning_rate": 0.1, "loss": 3.086958885192871, "step": 490 }, { "epoch": 0.01561904761904762, "grad_norm": 0.9609375, "learning_rate": 0.1, "loss": 3.0883636474609375, "step": 492 }, { "epoch": 0.015682539682539683, "grad_norm": 1.1015625, "learning_rate": 0.1, "loss": 3.0651333332061768, "step": 494 }, { "epoch": 0.015746031746031747, "grad_norm": 1.1953125, "learning_rate": 0.1, "loss": 3.0651001930236816, "step": 496 }, { "epoch": 0.015809523809523808, "grad_norm": 1.125, "learning_rate": 0.1, "loss": 3.0667943954467773, "step": 498 }, { "epoch": 0.015873015873015872, "grad_norm": 1.09375, "learning_rate": 0.1, "loss": 3.083754777908325, "step": 500 }, { "epoch": 0.015936507936507936, "grad_norm": 1.0390625, "learning_rate": 0.1, "loss": 3.061857223510742, "step": 502 }, { "epoch": 0.016, "grad_norm": 0.91796875, "learning_rate": 0.1, "loss": 3.05370831489563, "step": 504 }, { "epoch": 0.016063492063492064, "grad_norm": 0.859375, "learning_rate": 0.1, "loss": 3.0453765392303467, "step": 506 }, { "epoch": 0.01612698412698413, "grad_norm": 0.482421875, "learning_rate": 0.1, "loss": 3.057920455932617, "step": 508 }, { "epoch": 0.01619047619047619, "grad_norm": 0.423828125, "learning_rate": 0.1, "loss": 3.050530195236206, "step": 510 }, { "epoch": 0.016253968253968253, "grad_norm": 0.828125, "learning_rate": 0.1, "loss": 3.0628905296325684, "step": 512 }, { "epoch": 0.016317460317460317, "grad_norm": 0.9921875, "learning_rate": 0.1, "loss": 3.076108932495117, "step": 514 }, { "epoch": 0.01638095238095238, "grad_norm": 1.109375, "learning_rate": 0.1, "loss": 3.04408860206604, "step": 516 }, { "epoch": 0.016444444444444446, "grad_norm": 1.203125, "learning_rate": 0.1, "loss": 3.0683529376983643, "step": 518 }, { "epoch": 0.01650793650793651, "grad_norm": 1.5703125, "learning_rate": 0.1, "loss": 3.0220510959625244, "step": 520 }, { "epoch": 0.01657142857142857, "grad_norm": 1.7109375, "learning_rate": 0.1, "loss": 3.0539910793304443, "step": 522 }, { "epoch": 0.016634920634920634, "grad_norm": 1.71875, "learning_rate": 0.1, "loss": 3.068899154663086, "step": 524 }, { "epoch": 0.0166984126984127, "grad_norm": 1.1640625, "learning_rate": 0.1, "loss": 3.063685417175293, "step": 526 }, { "epoch": 0.016761904761904763, "grad_norm": 0.84375, "learning_rate": 0.1, "loss": 3.0484814643859863, "step": 528 }, { "epoch": 0.016825396825396827, "grad_norm": 0.38671875, "learning_rate": 0.1, "loss": 3.0499517917633057, "step": 530 }, { "epoch": 0.016888888888888887, "grad_norm": 0.28125, "learning_rate": 0.1, "loss": 3.0436675548553467, "step": 532 }, { "epoch": 0.01695238095238095, "grad_norm": 0.462890625, "learning_rate": 0.1, "loss": 3.025122880935669, "step": 534 }, { "epoch": 0.017015873015873016, "grad_norm": 0.890625, "learning_rate": 0.1, "loss": 3.0118329524993896, "step": 536 }, { "epoch": 0.01707936507936508, "grad_norm": 1.0546875, "learning_rate": 0.1, "loss": 3.021244764328003, "step": 538 }, { "epoch": 0.017142857142857144, "grad_norm": 1.03125, "learning_rate": 0.1, "loss": 3.02365779876709, "step": 540 }, { "epoch": 0.017206349206349208, "grad_norm": 1.078125, "learning_rate": 0.1, "loss": 3.0257298946380615, "step": 542 }, { "epoch": 0.01726984126984127, "grad_norm": 1.0859375, "learning_rate": 0.1, "loss": 3.0071115493774414, "step": 544 }, { "epoch": 0.017333333333333333, "grad_norm": 1.078125, "learning_rate": 0.1, "loss": 3.0089266300201416, "step": 546 }, { "epoch": 0.017396825396825397, "grad_norm": 1.15625, "learning_rate": 0.1, "loss": 3.0062456130981445, "step": 548 }, { "epoch": 0.01746031746031746, "grad_norm": 1.328125, "learning_rate": 0.1, "loss": 2.9699337482452393, "step": 550 }, { "epoch": 0.017523809523809525, "grad_norm": 1.3984375, "learning_rate": 0.1, "loss": 3.011267900466919, "step": 552 }, { "epoch": 0.01758730158730159, "grad_norm": 1.015625, "learning_rate": 0.1, "loss": 2.992412805557251, "step": 554 }, { "epoch": 0.01765079365079365, "grad_norm": 1.0078125, "learning_rate": 0.1, "loss": 3.009934425354004, "step": 556 }, { "epoch": 0.017714285714285714, "grad_norm": 0.96875, "learning_rate": 0.1, "loss": 3.0075061321258545, "step": 558 }, { "epoch": 0.017777777777777778, "grad_norm": 0.69140625, "learning_rate": 0.1, "loss": 2.9767303466796875, "step": 560 }, { "epoch": 0.017841269841269842, "grad_norm": 0.486328125, "learning_rate": 0.1, "loss": 2.983800172805786, "step": 562 }, { "epoch": 0.017904761904761906, "grad_norm": 0.484375, "learning_rate": 0.1, "loss": 2.9960591793060303, "step": 564 }, { "epoch": 0.017968253968253967, "grad_norm": 0.609375, "learning_rate": 0.1, "loss": 2.9825940132141113, "step": 566 }, { "epoch": 0.01803174603174603, "grad_norm": 0.6953125, "learning_rate": 0.1, "loss": 2.96907639503479, "step": 568 }, { "epoch": 0.018095238095238095, "grad_norm": 1.078125, "learning_rate": 0.1, "loss": 2.971550226211548, "step": 570 }, { "epoch": 0.01815873015873016, "grad_norm": 1.3125, "learning_rate": 0.1, "loss": 3.0110714435577393, "step": 572 }, { "epoch": 0.018222222222222223, "grad_norm": 1.3515625, "learning_rate": 0.1, "loss": 2.9801905155181885, "step": 574 }, { "epoch": 0.018285714285714287, "grad_norm": 1.296875, "learning_rate": 0.1, "loss": 2.9873695373535156, "step": 576 }, { "epoch": 0.018349206349206348, "grad_norm": 1.2734375, "learning_rate": 0.1, "loss": 2.9655425548553467, "step": 578 }, { "epoch": 0.018412698412698412, "grad_norm": 1.1328125, "learning_rate": 0.1, "loss": 2.9705631732940674, "step": 580 }, { "epoch": 0.018476190476190476, "grad_norm": 1.09375, "learning_rate": 0.1, "loss": 2.9764044284820557, "step": 582 }, { "epoch": 0.01853968253968254, "grad_norm": 0.87109375, "learning_rate": 0.1, "loss": 2.954822063446045, "step": 584 }, { "epoch": 0.018603174603174604, "grad_norm": 0.671875, "learning_rate": 0.1, "loss": 2.9759950637817383, "step": 586 }, { "epoch": 0.018666666666666668, "grad_norm": 0.58203125, "learning_rate": 0.1, "loss": 2.9718713760375977, "step": 588 }, { "epoch": 0.01873015873015873, "grad_norm": 0.6171875, "learning_rate": 0.1, "loss": 2.9884769916534424, "step": 590 }, { "epoch": 0.018793650793650793, "grad_norm": 1.171875, "learning_rate": 0.1, "loss": 2.988987922668457, "step": 592 }, { "epoch": 0.018857142857142857, "grad_norm": 1.3046875, "learning_rate": 0.1, "loss": 2.9998598098754883, "step": 594 }, { "epoch": 0.01892063492063492, "grad_norm": 1.2578125, "learning_rate": 0.1, "loss": 2.9731922149658203, "step": 596 }, { "epoch": 0.018984126984126985, "grad_norm": 1.3515625, "learning_rate": 0.1, "loss": 2.9923856258392334, "step": 598 }, { "epoch": 0.01904761904761905, "grad_norm": 1.4453125, "learning_rate": 0.1, "loss": 2.970440149307251, "step": 600 }, { "epoch": 0.01911111111111111, "grad_norm": 1.546875, "learning_rate": 0.1, "loss": 2.9418489933013916, "step": 602 }, { "epoch": 0.019174603174603174, "grad_norm": 1.59375, "learning_rate": 0.1, "loss": 2.9626381397247314, "step": 604 }, { "epoch": 0.019238095238095238, "grad_norm": 1.4609375, "learning_rate": 0.1, "loss": 2.9721691608428955, "step": 606 }, { "epoch": 0.019301587301587302, "grad_norm": 1.296875, "learning_rate": 0.1, "loss": 2.9747581481933594, "step": 608 }, { "epoch": 0.019365079365079366, "grad_norm": 1.2421875, "learning_rate": 0.1, "loss": 2.9440951347351074, "step": 610 }, { "epoch": 0.019428571428571427, "grad_norm": 1.265625, "learning_rate": 0.1, "loss": 2.9569873809814453, "step": 612 }, { "epoch": 0.01949206349206349, "grad_norm": 1.421875, "learning_rate": 0.1, "loss": 2.942096471786499, "step": 614 }, { "epoch": 0.019555555555555555, "grad_norm": 1.359375, "learning_rate": 0.1, "loss": 2.9659054279327393, "step": 616 }, { "epoch": 0.01961904761904762, "grad_norm": 1.203125, "learning_rate": 0.1, "loss": 2.9299516677856445, "step": 618 }, { "epoch": 0.019682539682539683, "grad_norm": 1.03125, "learning_rate": 0.1, "loss": 2.9392285346984863, "step": 620 }, { "epoch": 0.019746031746031747, "grad_norm": 0.58984375, "learning_rate": 0.1, "loss": 2.963862180709839, "step": 622 }, { "epoch": 0.019809523809523808, "grad_norm": 0.10546875, "learning_rate": 0.1, "loss": 2.946873426437378, "step": 624 }, { "epoch": 0.019873015873015872, "grad_norm": 1.046875, "learning_rate": 0.1, "loss": 2.944869041442871, "step": 626 }, { "epoch": 0.019936507936507936, "grad_norm": 0.84375, "learning_rate": 0.1, "loss": 2.89503812789917, "step": 628 }, { "epoch": 0.02, "grad_norm": 1.0, "learning_rate": 0.1, "loss": 2.937197208404541, "step": 630 }, { "epoch": 0.020063492063492065, "grad_norm": 0.99609375, "learning_rate": 0.1, "loss": 2.9275424480438232, "step": 632 }, { "epoch": 0.02012698412698413, "grad_norm": 0.96484375, "learning_rate": 0.1, "loss": 2.962554693222046, "step": 634 }, { "epoch": 0.02019047619047619, "grad_norm": 0.8828125, "learning_rate": 0.1, "loss": 2.914341926574707, "step": 636 }, { "epoch": 0.020253968253968253, "grad_norm": 1.1015625, "learning_rate": 0.1, "loss": 2.926637887954712, "step": 638 }, { "epoch": 0.020317460317460317, "grad_norm": 1.0390625, "learning_rate": 0.1, "loss": 2.9183430671691895, "step": 640 }, { "epoch": 0.02038095238095238, "grad_norm": 0.7578125, "learning_rate": 0.1, "loss": 2.9340102672576904, "step": 642 }, { "epoch": 0.020444444444444446, "grad_norm": 0.498046875, "learning_rate": 0.1, "loss": 2.886336326599121, "step": 644 }, { "epoch": 0.020507936507936506, "grad_norm": 0.65234375, "learning_rate": 0.1, "loss": 2.923856496810913, "step": 646 }, { "epoch": 0.02057142857142857, "grad_norm": 0.86328125, "learning_rate": 0.1, "loss": 2.915475368499756, "step": 648 }, { "epoch": 0.020634920634920634, "grad_norm": 1.140625, "learning_rate": 0.1, "loss": 2.9157824516296387, "step": 650 }, { "epoch": 0.0206984126984127, "grad_norm": 1.2578125, "learning_rate": 0.1, "loss": 2.8953075408935547, "step": 652 }, { "epoch": 0.020761904761904763, "grad_norm": 1.3828125, "learning_rate": 0.1, "loss": 2.9346120357513428, "step": 654 }, { "epoch": 0.020825396825396827, "grad_norm": 1.296875, "learning_rate": 0.1, "loss": 2.8997642993927, "step": 656 }, { "epoch": 0.020888888888888887, "grad_norm": 1.296875, "learning_rate": 0.1, "loss": 2.8891003131866455, "step": 658 }, { "epoch": 0.02095238095238095, "grad_norm": 1.3125, "learning_rate": 0.1, "loss": 2.8894336223602295, "step": 660 }, { "epoch": 0.021015873015873016, "grad_norm": 1.4609375, "learning_rate": 0.1, "loss": 2.9011178016662598, "step": 662 }, { "epoch": 0.02107936507936508, "grad_norm": 1.40625, "learning_rate": 0.1, "loss": 2.910674571990967, "step": 664 }, { "epoch": 0.021142857142857144, "grad_norm": 1.171875, "learning_rate": 0.1, "loss": 2.89648175239563, "step": 666 }, { "epoch": 0.021206349206349208, "grad_norm": 0.9140625, "learning_rate": 0.1, "loss": 2.909072160720825, "step": 668 }, { "epoch": 0.02126984126984127, "grad_norm": 0.1376953125, "learning_rate": 0.1, "loss": 2.895249128341675, "step": 670 }, { "epoch": 0.021333333333333333, "grad_norm": 1.4921875, "learning_rate": 0.1, "loss": 2.897890567779541, "step": 672 }, { "epoch": 0.021396825396825397, "grad_norm": 1.390625, "learning_rate": 0.1, "loss": 2.9007468223571777, "step": 674 }, { "epoch": 0.02146031746031746, "grad_norm": 1.3046875, "learning_rate": 0.1, "loss": 2.8998095989227295, "step": 676 }, { "epoch": 0.021523809523809525, "grad_norm": 1.34375, "learning_rate": 0.1, "loss": 2.902493715286255, "step": 678 }, { "epoch": 0.02158730158730159, "grad_norm": 1.5546875, "learning_rate": 0.1, "loss": 2.8992369174957275, "step": 680 }, { "epoch": 0.02165079365079365, "grad_norm": 1.6015625, "learning_rate": 0.1, "loss": 2.861717939376831, "step": 682 }, { "epoch": 0.021714285714285714, "grad_norm": 1.421875, "learning_rate": 0.1, "loss": 2.872589111328125, "step": 684 }, { "epoch": 0.021777777777777778, "grad_norm": 1.2265625, "learning_rate": 0.1, "loss": 2.9159390926361084, "step": 686 }, { "epoch": 0.021841269841269842, "grad_norm": 1.0625, "learning_rate": 0.1, "loss": 2.8983540534973145, "step": 688 }, { "epoch": 0.021904761904761906, "grad_norm": 0.84375, "learning_rate": 0.1, "loss": 2.862527370452881, "step": 690 }, { "epoch": 0.021968253968253967, "grad_norm": 0.875, "learning_rate": 0.1, "loss": 2.843687057495117, "step": 692 }, { "epoch": 0.02203174603174603, "grad_norm": 0.9453125, "learning_rate": 0.1, "loss": 2.8590738773345947, "step": 694 }, { "epoch": 0.022095238095238095, "grad_norm": 1.125, "learning_rate": 0.1, "loss": 2.8599703311920166, "step": 696 }, { "epoch": 0.02215873015873016, "grad_norm": 1.1796875, "learning_rate": 0.1, "loss": 2.8687939643859863, "step": 698 }, { "epoch": 0.022222222222222223, "grad_norm": 1.203125, "learning_rate": 0.1, "loss": 2.852980375289917, "step": 700 }, { "epoch": 0.022285714285714287, "grad_norm": 1.09375, "learning_rate": 0.1, "loss": 2.8813085556030273, "step": 702 }, { "epoch": 0.022349206349206348, "grad_norm": 1.28125, "learning_rate": 0.1, "loss": 2.825546979904175, "step": 704 }, { "epoch": 0.022412698412698412, "grad_norm": 1.5546875, "learning_rate": 0.1, "loss": 2.826983690261841, "step": 706 }, { "epoch": 0.022476190476190476, "grad_norm": 1.6796875, "learning_rate": 0.1, "loss": 2.8536782264709473, "step": 708 }, { "epoch": 0.02253968253968254, "grad_norm": 1.4296875, "learning_rate": 0.1, "loss": 2.86342716217041, "step": 710 }, { "epoch": 0.022603174603174604, "grad_norm": 1.234375, "learning_rate": 0.1, "loss": 2.8362812995910645, "step": 712 }, { "epoch": 0.02266666666666667, "grad_norm": 1.1875, "learning_rate": 0.1, "loss": 2.8674542903900146, "step": 714 }, { "epoch": 0.02273015873015873, "grad_norm": 1.1875, "learning_rate": 0.1, "loss": 2.8358709812164307, "step": 716 }, { "epoch": 0.022793650793650793, "grad_norm": 1.1796875, "learning_rate": 0.1, "loss": 2.8566155433654785, "step": 718 }, { "epoch": 0.022857142857142857, "grad_norm": 1.15625, "learning_rate": 0.1, "loss": 2.842625379562378, "step": 720 }, { "epoch": 0.02292063492063492, "grad_norm": 1.3203125, "learning_rate": 0.1, "loss": 2.8484599590301514, "step": 722 }, { "epoch": 0.022984126984126985, "grad_norm": 1.3984375, "learning_rate": 0.1, "loss": 2.8319098949432373, "step": 724 }, { "epoch": 0.023047619047619046, "grad_norm": 1.6328125, "learning_rate": 0.1, "loss": 2.8707053661346436, "step": 726 }, { "epoch": 0.02311111111111111, "grad_norm": 1.5390625, "learning_rate": 0.1, "loss": 2.8266420364379883, "step": 728 }, { "epoch": 0.023174603174603174, "grad_norm": 1.4453125, "learning_rate": 0.1, "loss": 2.8606181144714355, "step": 730 }, { "epoch": 0.023238095238095238, "grad_norm": 1.3359375, "learning_rate": 0.1, "loss": 2.8627452850341797, "step": 732 }, { "epoch": 0.023301587301587302, "grad_norm": 1.15625, "learning_rate": 0.1, "loss": 2.815427541732788, "step": 734 }, { "epoch": 0.023365079365079366, "grad_norm": 1.3515625, "learning_rate": 0.1, "loss": 2.8468315601348877, "step": 736 }, { "epoch": 0.023428571428571427, "grad_norm": 1.453125, "learning_rate": 0.1, "loss": 2.7851171493530273, "step": 738 }, { "epoch": 0.02349206349206349, "grad_norm": 1.515625, "learning_rate": 0.1, "loss": 2.811434268951416, "step": 740 }, { "epoch": 0.023555555555555555, "grad_norm": 1.6171875, "learning_rate": 0.1, "loss": 2.8238027095794678, "step": 742 }, { "epoch": 0.02361904761904762, "grad_norm": 1.5859375, "learning_rate": 0.1, "loss": 2.8118274211883545, "step": 744 }, { "epoch": 0.023682539682539683, "grad_norm": 1.34375, "learning_rate": 0.1, "loss": 2.833188533782959, "step": 746 }, { "epoch": 0.023746031746031748, "grad_norm": 1.21875, "learning_rate": 0.1, "loss": 2.8205175399780273, "step": 748 }, { "epoch": 0.023809523809523808, "grad_norm": 0.4765625, "learning_rate": 0.1, "loss": 2.8091824054718018, "step": 750 }, { "epoch": 0.023873015873015872, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.812747001647949, "step": 752 }, { "epoch": 0.023936507936507936, "grad_norm": 1.09375, "learning_rate": 0.1, "loss": 2.8240513801574707, "step": 754 }, { "epoch": 0.024, "grad_norm": 1.1640625, "learning_rate": 0.1, "loss": 2.7974672317504883, "step": 756 }, { "epoch": 0.024063492063492065, "grad_norm": 1.09375, "learning_rate": 0.1, "loss": 2.7879927158355713, "step": 758 }, { "epoch": 0.02412698412698413, "grad_norm": 1.0546875, "learning_rate": 0.1, "loss": 2.8080880641937256, "step": 760 }, { "epoch": 0.02419047619047619, "grad_norm": 1.0390625, "learning_rate": 0.1, "loss": 2.818265914916992, "step": 762 }, { "epoch": 0.024253968253968253, "grad_norm": 1.09375, "learning_rate": 0.1, "loss": 2.808403730392456, "step": 764 }, { "epoch": 0.024317460317460318, "grad_norm": 1.171875, "learning_rate": 0.1, "loss": 2.8216428756713867, "step": 766 }, { "epoch": 0.02438095238095238, "grad_norm": 1.0859375, "learning_rate": 0.1, "loss": 2.790566921234131, "step": 768 }, { "epoch": 0.024444444444444446, "grad_norm": 0.98046875, "learning_rate": 0.1, "loss": 2.7953078746795654, "step": 770 }, { "epoch": 0.024507936507936506, "grad_norm": 1.0078125, "learning_rate": 0.1, "loss": 2.84751558303833, "step": 772 }, { "epoch": 0.02457142857142857, "grad_norm": 0.96484375, "learning_rate": 0.1, "loss": 2.7915351390838623, "step": 774 }, { "epoch": 0.024634920634920635, "grad_norm": 0.921875, "learning_rate": 0.1, "loss": 2.7894504070281982, "step": 776 }, { "epoch": 0.0246984126984127, "grad_norm": 0.1171875, "learning_rate": 0.1, "loss": 2.8002054691314697, "step": 778 }, { "epoch": 0.024761904761904763, "grad_norm": 1.7578125, "learning_rate": 0.1, "loss": 2.786452054977417, "step": 780 }, { "epoch": 0.024825396825396827, "grad_norm": 1.859375, "learning_rate": 0.1, "loss": 2.794553279876709, "step": 782 }, { "epoch": 0.024888888888888887, "grad_norm": 1.7578125, "learning_rate": 0.1, "loss": 2.7987453937530518, "step": 784 }, { "epoch": 0.02495238095238095, "grad_norm": 1.625, "learning_rate": 0.1, "loss": 2.804255962371826, "step": 786 }, { "epoch": 0.025015873015873016, "grad_norm": 1.5625, "learning_rate": 0.1, "loss": 2.770111560821533, "step": 788 }, { "epoch": 0.02507936507936508, "grad_norm": 1.640625, "learning_rate": 0.1, "loss": 2.8004913330078125, "step": 790 }, { "epoch": 0.025142857142857144, "grad_norm": 1.7890625, "learning_rate": 0.1, "loss": 2.8117387294769287, "step": 792 }, { "epoch": 0.025206349206349208, "grad_norm": 1.296875, "learning_rate": 0.1, "loss": 2.808410406112671, "step": 794 }, { "epoch": 0.02526984126984127, "grad_norm": 1.0234375, "learning_rate": 0.1, "loss": 2.833763599395752, "step": 796 }, { "epoch": 0.025333333333333333, "grad_norm": 0.89453125, "learning_rate": 0.1, "loss": 2.7889950275421143, "step": 798 }, { "epoch": 0.025396825396825397, "grad_norm": 1.1953125, "learning_rate": 0.1, "loss": 2.77565598487854, "step": 800 }, { "epoch": 0.02546031746031746, "grad_norm": 1.3515625, "learning_rate": 0.1, "loss": 2.802668809890747, "step": 802 }, { "epoch": 0.025523809523809525, "grad_norm": 1.3984375, "learning_rate": 0.1, "loss": 2.7716064453125, "step": 804 }, { "epoch": 0.025587301587301586, "grad_norm": 1.546875, "learning_rate": 0.1, "loss": 2.770905017852783, "step": 806 }, { "epoch": 0.02565079365079365, "grad_norm": 1.484375, "learning_rate": 0.1, "loss": 2.776806592941284, "step": 808 }, { "epoch": 0.025714285714285714, "grad_norm": 1.4375, "learning_rate": 0.1, "loss": 2.7744343280792236, "step": 810 }, { "epoch": 0.025777777777777778, "grad_norm": 1.1875, "learning_rate": 0.1, "loss": 2.801470994949341, "step": 812 }, { "epoch": 0.025841269841269842, "grad_norm": 0.93359375, "learning_rate": 0.1, "loss": 2.7741458415985107, "step": 814 }, { "epoch": 0.025904761904761906, "grad_norm": 1.2890625, "learning_rate": 0.1, "loss": 2.7982726097106934, "step": 816 }, { "epoch": 0.025968253968253967, "grad_norm": 1.34375, "learning_rate": 0.1, "loss": 2.77854323387146, "step": 818 }, { "epoch": 0.02603174603174603, "grad_norm": 1.265625, "learning_rate": 0.1, "loss": 2.7719085216522217, "step": 820 }, { "epoch": 0.026095238095238095, "grad_norm": 1.1875, "learning_rate": 0.1, "loss": 2.7646918296813965, "step": 822 }, { "epoch": 0.02615873015873016, "grad_norm": 1.0390625, "learning_rate": 0.1, "loss": 2.7878687381744385, "step": 824 }, { "epoch": 0.026222222222222223, "grad_norm": 0.9765625, "learning_rate": 0.1, "loss": 2.778719425201416, "step": 826 }, { "epoch": 0.026285714285714287, "grad_norm": 0.96875, "learning_rate": 0.1, "loss": 2.7779152393341064, "step": 828 }, { "epoch": 0.026349206349206348, "grad_norm": 1.15625, "learning_rate": 0.1, "loss": 2.7839818000793457, "step": 830 }, { "epoch": 0.026412698412698412, "grad_norm": 1.4140625, "learning_rate": 0.1, "loss": 2.7726778984069824, "step": 832 }, { "epoch": 0.026476190476190476, "grad_norm": 1.5078125, "learning_rate": 0.1, "loss": 2.770808458328247, "step": 834 }, { "epoch": 0.02653968253968254, "grad_norm": 1.484375, "learning_rate": 0.1, "loss": 2.7722103595733643, "step": 836 }, { "epoch": 0.026603174603174604, "grad_norm": 1.4375, "learning_rate": 0.1, "loss": 2.7786643505096436, "step": 838 }, { "epoch": 0.02666666666666667, "grad_norm": 1.265625, "learning_rate": 0.1, "loss": 2.7710602283477783, "step": 840 }, { "epoch": 0.02673015873015873, "grad_norm": 1.3125, "learning_rate": 0.1, "loss": 2.7921838760375977, "step": 842 }, { "epoch": 0.026793650793650793, "grad_norm": 1.2265625, "learning_rate": 0.1, "loss": 2.775670051574707, "step": 844 }, { "epoch": 0.026857142857142857, "grad_norm": 0.91015625, "learning_rate": 0.1, "loss": 2.7906174659729004, "step": 846 }, { "epoch": 0.02692063492063492, "grad_norm": 0.90625, "learning_rate": 0.1, "loss": 2.7591354846954346, "step": 848 }, { "epoch": 0.026984126984126985, "grad_norm": 1.359375, "learning_rate": 0.1, "loss": 2.776829242706299, "step": 850 }, { "epoch": 0.027047619047619046, "grad_norm": 1.6640625, "learning_rate": 0.1, "loss": 2.776118755340576, "step": 852 }, { "epoch": 0.02711111111111111, "grad_norm": 1.5234375, "learning_rate": 0.1, "loss": 2.789954662322998, "step": 854 }, { "epoch": 0.027174603174603174, "grad_norm": 1.2578125, "learning_rate": 0.1, "loss": 2.7577691078186035, "step": 856 }, { "epoch": 0.02723809523809524, "grad_norm": 1.1875, "learning_rate": 0.1, "loss": 2.7567570209503174, "step": 858 }, { "epoch": 0.027301587301587302, "grad_norm": 1.171875, "learning_rate": 0.1, "loss": 2.737403392791748, "step": 860 }, { "epoch": 0.027365079365079367, "grad_norm": 1.3046875, "learning_rate": 0.1, "loss": 2.8078079223632812, "step": 862 }, { "epoch": 0.027428571428571427, "grad_norm": 1.265625, "learning_rate": 0.1, "loss": 2.7714951038360596, "step": 864 }, { "epoch": 0.02749206349206349, "grad_norm": 1.2421875, "learning_rate": 0.1, "loss": 2.772411823272705, "step": 866 }, { "epoch": 0.027555555555555555, "grad_norm": 1.265625, "learning_rate": 0.1, "loss": 2.75351881980896, "step": 868 }, { "epoch": 0.02761904761904762, "grad_norm": 1.28125, "learning_rate": 0.1, "loss": 2.749804973602295, "step": 870 }, { "epoch": 0.027682539682539684, "grad_norm": 1.34375, "learning_rate": 0.1, "loss": 2.744938850402832, "step": 872 }, { "epoch": 0.027746031746031748, "grad_norm": 1.4140625, "learning_rate": 0.1, "loss": 2.7577157020568848, "step": 874 }, { "epoch": 0.02780952380952381, "grad_norm": 1.5, "learning_rate": 0.1, "loss": 2.7605154514312744, "step": 876 }, { "epoch": 0.027873015873015872, "grad_norm": 1.375, "learning_rate": 0.1, "loss": 2.7187612056732178, "step": 878 }, { "epoch": 0.027936507936507936, "grad_norm": 1.25, "learning_rate": 0.1, "loss": 2.7382328510284424, "step": 880 }, { "epoch": 0.028, "grad_norm": 1.2578125, "learning_rate": 0.1, "loss": 2.7806878089904785, "step": 882 }, { "epoch": 0.028063492063492065, "grad_norm": 1.2265625, "learning_rate": 0.1, "loss": 2.749373435974121, "step": 884 }, { "epoch": 0.028126984126984125, "grad_norm": 1.1484375, "learning_rate": 0.1, "loss": 2.7527058124542236, "step": 886 }, { "epoch": 0.02819047619047619, "grad_norm": 1.2109375, "learning_rate": 0.1, "loss": 2.7383816242218018, "step": 888 }, { "epoch": 0.028253968253968254, "grad_norm": 1.21875, "learning_rate": 0.1, "loss": 2.7506749629974365, "step": 890 }, { "epoch": 0.028317460317460318, "grad_norm": 1.1015625, "learning_rate": 0.1, "loss": 2.730591297149658, "step": 892 }, { "epoch": 0.02838095238095238, "grad_norm": 1.1796875, "learning_rate": 0.1, "loss": 2.7507312297821045, "step": 894 }, { "epoch": 0.028444444444444446, "grad_norm": 1.1484375, "learning_rate": 0.1, "loss": 2.715519428253174, "step": 896 }, { "epoch": 0.028507936507936506, "grad_norm": 1.1875, "learning_rate": 0.1, "loss": 2.73425555229187, "step": 898 }, { "epoch": 0.02857142857142857, "grad_norm": 1.2734375, "learning_rate": 0.1, "loss": 2.7391855716705322, "step": 900 }, { "epoch": 0.028634920634920635, "grad_norm": 1.3359375, "learning_rate": 0.1, "loss": 2.7261483669281006, "step": 902 }, { "epoch": 0.0286984126984127, "grad_norm": 1.25, "learning_rate": 0.1, "loss": 2.716630458831787, "step": 904 }, { "epoch": 0.028761904761904763, "grad_norm": 1.28125, "learning_rate": 0.1, "loss": 2.7437281608581543, "step": 906 }, { "epoch": 0.028825396825396827, "grad_norm": 1.3125, "learning_rate": 0.1, "loss": 2.7325570583343506, "step": 908 }, { "epoch": 0.028888888888888888, "grad_norm": 1.3359375, "learning_rate": 0.1, "loss": 2.7495415210723877, "step": 910 }, { "epoch": 0.02895238095238095, "grad_norm": 0.9765625, "learning_rate": 0.1, "loss": 2.76709246635437, "step": 912 }, { "epoch": 0.029015873015873016, "grad_norm": 0.306640625, "learning_rate": 0.1, "loss": 2.7282824516296387, "step": 914 }, { "epoch": 0.02907936507936508, "grad_norm": 0.365234375, "learning_rate": 0.1, "loss": 2.759833812713623, "step": 916 }, { "epoch": 0.029142857142857144, "grad_norm": 1.15625, "learning_rate": 0.1, "loss": 2.725062847137451, "step": 918 }, { "epoch": 0.029206349206349208, "grad_norm": 1.15625, "learning_rate": 0.1, "loss": 2.726435422897339, "step": 920 }, { "epoch": 0.02926984126984127, "grad_norm": 1.0625, "learning_rate": 0.1, "loss": 2.769052505493164, "step": 922 }, { "epoch": 0.029333333333333333, "grad_norm": 0.91015625, "learning_rate": 0.1, "loss": 2.7416439056396484, "step": 924 }, { "epoch": 0.029396825396825397, "grad_norm": 1.140625, "learning_rate": 0.1, "loss": 2.7405333518981934, "step": 926 }, { "epoch": 0.02946031746031746, "grad_norm": 1.171875, "learning_rate": 0.1, "loss": 2.7387545108795166, "step": 928 }, { "epoch": 0.029523809523809525, "grad_norm": 1.3515625, "learning_rate": 0.1, "loss": 2.752532482147217, "step": 930 }, { "epoch": 0.029587301587301586, "grad_norm": 1.4609375, "learning_rate": 0.1, "loss": 2.7284107208251953, "step": 932 }, { "epoch": 0.02965079365079365, "grad_norm": 1.3046875, "learning_rate": 0.1, "loss": 2.7543957233428955, "step": 934 }, { "epoch": 0.029714285714285714, "grad_norm": 1.171875, "learning_rate": 0.1, "loss": 2.728212594985962, "step": 936 }, { "epoch": 0.029777777777777778, "grad_norm": 1.109375, "learning_rate": 0.1, "loss": 2.727241277694702, "step": 938 }, { "epoch": 0.029841269841269842, "grad_norm": 0.921875, "learning_rate": 0.1, "loss": 2.748721122741699, "step": 940 }, { "epoch": 0.029904761904761906, "grad_norm": 1.1171875, "learning_rate": 0.1, "loss": 2.7443525791168213, "step": 942 }, { "epoch": 0.029968253968253967, "grad_norm": 1.2265625, "learning_rate": 0.1, "loss": 2.7418339252471924, "step": 944 }, { "epoch": 0.03003174603174603, "grad_norm": 1.1796875, "learning_rate": 0.1, "loss": 2.730739116668701, "step": 946 }, { "epoch": 0.030095238095238095, "grad_norm": 1.171875, "learning_rate": 0.1, "loss": 2.738229513168335, "step": 948 }, { "epoch": 0.03015873015873016, "grad_norm": 1.1640625, "learning_rate": 0.1, "loss": 2.729902505874634, "step": 950 }, { "epoch": 0.030222222222222223, "grad_norm": 1.1796875, "learning_rate": 0.1, "loss": 2.7078258991241455, "step": 952 }, { "epoch": 0.030285714285714287, "grad_norm": 1.0625, "learning_rate": 0.1, "loss": 2.725454092025757, "step": 954 }, { "epoch": 0.030349206349206348, "grad_norm": 0.9921875, "learning_rate": 0.1, "loss": 2.7212934494018555, "step": 956 }, { "epoch": 0.030412698412698412, "grad_norm": 0.98828125, "learning_rate": 0.1, "loss": 2.7365057468414307, "step": 958 }, { "epoch": 0.030476190476190476, "grad_norm": 1.015625, "learning_rate": 0.1, "loss": 2.744489908218384, "step": 960 }, { "epoch": 0.03053968253968254, "grad_norm": 0.953125, "learning_rate": 0.1, "loss": 2.699479579925537, "step": 962 }, { "epoch": 0.030603174603174604, "grad_norm": 0.9765625, "learning_rate": 0.1, "loss": 2.7472617626190186, "step": 964 }, { "epoch": 0.030666666666666665, "grad_norm": 0.828125, "learning_rate": 0.1, "loss": 2.7531509399414062, "step": 966 }, { "epoch": 0.03073015873015873, "grad_norm": 0.94140625, "learning_rate": 0.1, "loss": 2.7409565448760986, "step": 968 }, { "epoch": 0.030793650793650793, "grad_norm": 0.8125, "learning_rate": 0.1, "loss": 2.724207639694214, "step": 970 }, { "epoch": 0.030857142857142857, "grad_norm": 1.1953125, "learning_rate": 0.1, "loss": 2.740102767944336, "step": 972 }, { "epoch": 0.03092063492063492, "grad_norm": 1.2109375, "learning_rate": 0.1, "loss": 2.7390551567077637, "step": 974 }, { "epoch": 0.030984126984126985, "grad_norm": 0.91015625, "learning_rate": 0.1, "loss": 2.73054838180542, "step": 976 }, { "epoch": 0.031047619047619046, "grad_norm": 0.92578125, "learning_rate": 0.1, "loss": 2.7184252738952637, "step": 978 }, { "epoch": 0.03111111111111111, "grad_norm": 0.59765625, "learning_rate": 0.1, "loss": 2.7201907634735107, "step": 980 }, { "epoch": 0.031174603174603174, "grad_norm": 0.55078125, "learning_rate": 0.1, "loss": 2.7291829586029053, "step": 982 }, { "epoch": 0.03123809523809524, "grad_norm": 0.6171875, "learning_rate": 0.1, "loss": 2.719303846359253, "step": 984 }, { "epoch": 0.0313015873015873, "grad_norm": 1.265625, "learning_rate": 0.1, "loss": 2.725726366043091, "step": 986 }, { "epoch": 0.03136507936507937, "grad_norm": 1.4296875, "learning_rate": 0.1, "loss": 2.737971782684326, "step": 988 }, { "epoch": 0.03142857142857143, "grad_norm": 1.3828125, "learning_rate": 0.1, "loss": 2.716897964477539, "step": 990 }, { "epoch": 0.031492063492063495, "grad_norm": 1.2734375, "learning_rate": 0.1, "loss": 2.6911563873291016, "step": 992 }, { "epoch": 0.03155555555555556, "grad_norm": 1.15625, "learning_rate": 0.1, "loss": 2.728645086288452, "step": 994 }, { "epoch": 0.031619047619047616, "grad_norm": 1.0, "learning_rate": 0.1, "loss": 2.719571590423584, "step": 996 }, { "epoch": 0.03168253968253968, "grad_norm": 0.94921875, "learning_rate": 0.1, "loss": 2.7137248516082764, "step": 998 }, { "epoch": 0.031746031746031744, "grad_norm": 1.0703125, "learning_rate": 0.1, "loss": 2.704073429107666, "step": 1000 }, { "epoch": 0.03180952380952381, "grad_norm": 1.0390625, "learning_rate": 0.1, "loss": 2.7135114669799805, "step": 1002 }, { "epoch": 0.03187301587301587, "grad_norm": 1.0390625, "learning_rate": 0.1, "loss": 2.72782039642334, "step": 1004 }, { "epoch": 0.03193650793650794, "grad_norm": 0.9921875, "learning_rate": 0.1, "loss": 2.7318570613861084, "step": 1006 }, { "epoch": 0.032, "grad_norm": 0.9296875, "learning_rate": 0.1, "loss": 2.6905698776245117, "step": 1008 }, { "epoch": 0.032063492063492065, "grad_norm": 1.0703125, "learning_rate": 0.1, "loss": 2.6933441162109375, "step": 1010 }, { "epoch": 0.03212698412698413, "grad_norm": 1.0390625, "learning_rate": 0.1, "loss": 2.7407314777374268, "step": 1012 }, { "epoch": 0.03219047619047619, "grad_norm": 1.078125, "learning_rate": 0.1, "loss": 2.7128958702087402, "step": 1014 }, { "epoch": 0.03225396825396826, "grad_norm": 1.1171875, "learning_rate": 0.1, "loss": 2.7540509700775146, "step": 1016 }, { "epoch": 0.032317460317460314, "grad_norm": 1.078125, "learning_rate": 0.1, "loss": 2.7136805057525635, "step": 1018 }, { "epoch": 0.03238095238095238, "grad_norm": 1.1484375, "learning_rate": 0.1, "loss": 2.709768056869507, "step": 1020 }, { "epoch": 0.03244444444444444, "grad_norm": 1.1328125, "learning_rate": 0.1, "loss": 2.6986868381500244, "step": 1022 }, { "epoch": 0.032507936507936507, "grad_norm": 1.078125, "learning_rate": 0.1, "loss": 2.689272165298462, "step": 1024 }, { "epoch": 0.03257142857142857, "grad_norm": 1.0078125, "learning_rate": 0.1, "loss": 2.730846405029297, "step": 1026 }, { "epoch": 0.032634920634920635, "grad_norm": 1.0390625, "learning_rate": 0.1, "loss": 2.70094895362854, "step": 1028 }, { "epoch": 0.0326984126984127, "grad_norm": 0.859375, "learning_rate": 0.1, "loss": 2.6822962760925293, "step": 1030 }, { "epoch": 0.03276190476190476, "grad_norm": 1.078125, "learning_rate": 0.1, "loss": 2.696621894836426, "step": 1032 }, { "epoch": 0.03282539682539683, "grad_norm": 0.75390625, "learning_rate": 0.1, "loss": 2.691941261291504, "step": 1034 }, { "epoch": 0.03288888888888889, "grad_norm": 0.796875, "learning_rate": 0.1, "loss": 2.7094075679779053, "step": 1036 }, { "epoch": 0.032952380952380955, "grad_norm": 0.74609375, "learning_rate": 0.1, "loss": 2.709071636199951, "step": 1038 }, { "epoch": 0.03301587301587302, "grad_norm": 0.953125, "learning_rate": 0.1, "loss": 2.6925957202911377, "step": 1040 }, { "epoch": 0.033079365079365076, "grad_norm": 1.0546875, "learning_rate": 0.1, "loss": 2.7152931690216064, "step": 1042 }, { "epoch": 0.03314285714285714, "grad_norm": 0.921875, "learning_rate": 0.1, "loss": 2.705169439315796, "step": 1044 }, { "epoch": 0.033206349206349205, "grad_norm": 1.078125, "learning_rate": 0.1, "loss": 2.6950294971466064, "step": 1046 }, { "epoch": 0.03326984126984127, "grad_norm": 0.95703125, "learning_rate": 0.1, "loss": 2.672210693359375, "step": 1048 }, { "epoch": 0.03333333333333333, "grad_norm": 0.92578125, "learning_rate": 0.1, "loss": 2.715027332305908, "step": 1050 }, { "epoch": 0.0333968253968254, "grad_norm": 0.68359375, "learning_rate": 0.1, "loss": 2.683643341064453, "step": 1052 }, { "epoch": 0.03346031746031746, "grad_norm": 0.4453125, "learning_rate": 0.1, "loss": 2.675783157348633, "step": 1054 }, { "epoch": 0.033523809523809525, "grad_norm": 0.7109375, "learning_rate": 0.1, "loss": 2.6654181480407715, "step": 1056 }, { "epoch": 0.03358730158730159, "grad_norm": 1.3515625, "learning_rate": 0.1, "loss": 2.685187578201294, "step": 1058 }, { "epoch": 0.03365079365079365, "grad_norm": 1.0546875, "learning_rate": 0.1, "loss": 2.6827166080474854, "step": 1060 }, { "epoch": 0.03371428571428572, "grad_norm": 0.73046875, "learning_rate": 0.1, "loss": 2.6614224910736084, "step": 1062 }, { "epoch": 0.033777777777777775, "grad_norm": 1.296875, "learning_rate": 0.1, "loss": 2.699321746826172, "step": 1064 }, { "epoch": 0.03384126984126984, "grad_norm": 0.703125, "learning_rate": 0.1, "loss": 2.69404673576355, "step": 1066 }, { "epoch": 0.0339047619047619, "grad_norm": 1.03125, "learning_rate": 0.1, "loss": 2.702907085418701, "step": 1068 }, { "epoch": 0.03396825396825397, "grad_norm": 0.81640625, "learning_rate": 0.1, "loss": 2.675267457962036, "step": 1070 }, { "epoch": 0.03403174603174603, "grad_norm": 1.15625, "learning_rate": 0.1, "loss": 2.6859309673309326, "step": 1072 }, { "epoch": 0.034095238095238095, "grad_norm": 0.73828125, "learning_rate": 0.1, "loss": 2.6990792751312256, "step": 1074 }, { "epoch": 0.03415873015873016, "grad_norm": 0.859375, "learning_rate": 0.1, "loss": 2.7048330307006836, "step": 1076 }, { "epoch": 0.03422222222222222, "grad_norm": 0.66796875, "learning_rate": 0.1, "loss": 2.6692311763763428, "step": 1078 }, { "epoch": 0.03428571428571429, "grad_norm": 0.5625, "learning_rate": 0.1, "loss": 2.6762001514434814, "step": 1080 }, { "epoch": 0.03434920634920635, "grad_norm": 0.77734375, "learning_rate": 0.1, "loss": 2.7075562477111816, "step": 1082 }, { "epoch": 0.034412698412698416, "grad_norm": 1.09375, "learning_rate": 0.1, "loss": 2.677772283554077, "step": 1084 }, { "epoch": 0.03447619047619047, "grad_norm": 1.1484375, "learning_rate": 0.1, "loss": 2.6849639415740967, "step": 1086 }, { "epoch": 0.03453968253968254, "grad_norm": 0.9609375, "learning_rate": 0.1, "loss": 2.681471586227417, "step": 1088 }, { "epoch": 0.0346031746031746, "grad_norm": 1.03125, "learning_rate": 0.1, "loss": 2.683176279067993, "step": 1090 }, { "epoch": 0.034666666666666665, "grad_norm": 0.75, "learning_rate": 0.1, "loss": 2.641491651535034, "step": 1092 }, { "epoch": 0.03473015873015873, "grad_norm": 0.63671875, "learning_rate": 0.1, "loss": 2.6605255603790283, "step": 1094 }, { "epoch": 0.03479365079365079, "grad_norm": 0.80078125, "learning_rate": 0.1, "loss": 2.6729602813720703, "step": 1096 }, { "epoch": 0.03485714285714286, "grad_norm": 0.88671875, "learning_rate": 0.1, "loss": 2.663205623626709, "step": 1098 }, { "epoch": 0.03492063492063492, "grad_norm": 0.6875, "learning_rate": 0.1, "loss": 2.6716582775115967, "step": 1100 }, { "epoch": 0.034984126984126986, "grad_norm": 0.39453125, "learning_rate": 0.1, "loss": 2.6422319412231445, "step": 1102 }, { "epoch": 0.03504761904761905, "grad_norm": 0.416015625, "learning_rate": 0.1, "loss": 2.7335588932037354, "step": 1104 }, { "epoch": 0.035111111111111114, "grad_norm": 0.42578125, "learning_rate": 0.1, "loss": 2.6882553100585938, "step": 1106 }, { "epoch": 0.03517460317460318, "grad_norm": 0.33984375, "learning_rate": 0.1, "loss": 2.706934690475464, "step": 1108 }, { "epoch": 0.035238095238095235, "grad_norm": 0.373046875, "learning_rate": 0.1, "loss": 2.6444387435913086, "step": 1110 }, { "epoch": 0.0353015873015873, "grad_norm": 0.52734375, "learning_rate": 0.1, "loss": 2.6673097610473633, "step": 1112 }, { "epoch": 0.03536507936507936, "grad_norm": 0.84375, "learning_rate": 0.1, "loss": 2.685286521911621, "step": 1114 }, { "epoch": 0.03542857142857143, "grad_norm": 1.140625, "learning_rate": 0.1, "loss": 2.660116672515869, "step": 1116 }, { "epoch": 0.03549206349206349, "grad_norm": 1.0, "learning_rate": 0.1, "loss": 2.6831166744232178, "step": 1118 }, { "epoch": 0.035555555555555556, "grad_norm": 0.875, "learning_rate": 0.1, "loss": 2.6393473148345947, "step": 1120 }, { "epoch": 0.03561904761904762, "grad_norm": 1.1015625, "learning_rate": 0.1, "loss": 2.6893651485443115, "step": 1122 }, { "epoch": 0.035682539682539684, "grad_norm": 0.84765625, "learning_rate": 0.1, "loss": 2.674464464187622, "step": 1124 }, { "epoch": 0.03574603174603175, "grad_norm": 0.80859375, "learning_rate": 0.1, "loss": 2.664659023284912, "step": 1126 }, { "epoch": 0.03580952380952381, "grad_norm": 0.55859375, "learning_rate": 0.1, "loss": 2.665724992752075, "step": 1128 }, { "epoch": 0.035873015873015876, "grad_norm": 0.314453125, "learning_rate": 0.1, "loss": 2.672153949737549, "step": 1130 }, { "epoch": 0.03593650793650793, "grad_norm": 0.25, "learning_rate": 0.1, "loss": 2.670444965362549, "step": 1132 }, { "epoch": 0.036, "grad_norm": 1.03125, "learning_rate": 0.1, "loss": 2.6757137775421143, "step": 1134 }, { "epoch": 0.03606349206349206, "grad_norm": 0.796875, "learning_rate": 0.1, "loss": 2.681288480758667, "step": 1136 }, { "epoch": 0.036126984126984125, "grad_norm": 0.7109375, "learning_rate": 0.1, "loss": 2.692081928253174, "step": 1138 }, { "epoch": 0.03619047619047619, "grad_norm": 0.703125, "learning_rate": 0.1, "loss": 2.6748154163360596, "step": 1140 }, { "epoch": 0.036253968253968254, "grad_norm": 0.56640625, "learning_rate": 0.1, "loss": 2.6559865474700928, "step": 1142 }, { "epoch": 0.03631746031746032, "grad_norm": 0.50390625, "learning_rate": 0.1, "loss": 2.688525676727295, "step": 1144 }, { "epoch": 0.03638095238095238, "grad_norm": 0.5078125, "learning_rate": 0.1, "loss": 2.6724460124969482, "step": 1146 }, { "epoch": 0.036444444444444446, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.6787240505218506, "step": 1148 }, { "epoch": 0.03650793650793651, "grad_norm": 0.369140625, "learning_rate": 0.1, "loss": 2.6724512577056885, "step": 1150 }, { "epoch": 0.036571428571428574, "grad_norm": 0.83984375, "learning_rate": 0.1, "loss": 2.6712300777435303, "step": 1152 }, { "epoch": 0.03663492063492064, "grad_norm": 0.72265625, "learning_rate": 0.1, "loss": 2.683469533920288, "step": 1154 }, { "epoch": 0.036698412698412695, "grad_norm": 0.7109375, "learning_rate": 0.1, "loss": 2.6536800861358643, "step": 1156 }, { "epoch": 0.03676190476190476, "grad_norm": 0.64453125, "learning_rate": 0.1, "loss": 2.661208391189575, "step": 1158 }, { "epoch": 0.036825396825396824, "grad_norm": 0.63671875, "learning_rate": 0.1, "loss": 2.643231153488159, "step": 1160 }, { "epoch": 0.03688888888888889, "grad_norm": 0.5625, "learning_rate": 0.1, "loss": 2.635059118270874, "step": 1162 }, { "epoch": 0.03695238095238095, "grad_norm": 0.40234375, "learning_rate": 0.1, "loss": 2.6533446311950684, "step": 1164 }, { "epoch": 0.037015873015873016, "grad_norm": 0.349609375, "learning_rate": 0.1, "loss": 2.6570746898651123, "step": 1166 }, { "epoch": 0.03707936507936508, "grad_norm": 0.474609375, "learning_rate": 0.1, "loss": 2.672158718109131, "step": 1168 }, { "epoch": 0.037142857142857144, "grad_norm": 0.353515625, "learning_rate": 0.1, "loss": 2.660187005996704, "step": 1170 }, { "epoch": 0.03720634920634921, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.642498016357422, "step": 1172 }, { "epoch": 0.03726984126984127, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.658662796020508, "step": 1174 }, { "epoch": 0.037333333333333336, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.666717529296875, "step": 1176 }, { "epoch": 0.037396825396825394, "grad_norm": 0.421875, "learning_rate": 0.1, "loss": 2.6217916011810303, "step": 1178 }, { "epoch": 0.03746031746031746, "grad_norm": 0.5703125, "learning_rate": 0.1, "loss": 2.6821365356445312, "step": 1180 }, { "epoch": 0.03752380952380952, "grad_norm": 0.5703125, "learning_rate": 0.1, "loss": 2.646242141723633, "step": 1182 }, { "epoch": 0.037587301587301586, "grad_norm": 0.515625, "learning_rate": 0.1, "loss": 2.6599268913269043, "step": 1184 }, { "epoch": 0.03765079365079365, "grad_norm": 0.55859375, "learning_rate": 0.1, "loss": 2.6795706748962402, "step": 1186 }, { "epoch": 0.037714285714285714, "grad_norm": 0.46875, "learning_rate": 0.1, "loss": 2.6538643836975098, "step": 1188 }, { "epoch": 0.03777777777777778, "grad_norm": 0.51953125, "learning_rate": 0.1, "loss": 2.6574606895446777, "step": 1190 }, { "epoch": 0.03784126984126984, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.696505546569824, "step": 1192 }, { "epoch": 0.037904761904761906, "grad_norm": 0.333984375, "learning_rate": 0.1, "loss": 2.6766324043273926, "step": 1194 }, { "epoch": 0.03796825396825397, "grad_norm": 0.29296875, "learning_rate": 0.1, "loss": 2.6542508602142334, "step": 1196 }, { "epoch": 0.038031746031746035, "grad_norm": 0.5234375, "learning_rate": 0.1, "loss": 2.6598339080810547, "step": 1198 }, { "epoch": 0.0380952380952381, "grad_norm": 0.69921875, "learning_rate": 0.1, "loss": 2.656649112701416, "step": 1200 }, { "epoch": 0.038158730158730156, "grad_norm": 0.5390625, "learning_rate": 0.1, "loss": 2.6448447704315186, "step": 1202 }, { "epoch": 0.03822222222222222, "grad_norm": 0.498046875, "learning_rate": 0.1, "loss": 2.692906379699707, "step": 1204 }, { "epoch": 0.038285714285714284, "grad_norm": 0.431640625, "learning_rate": 0.1, "loss": 2.665818691253662, "step": 1206 }, { "epoch": 0.03834920634920635, "grad_norm": 0.388671875, "learning_rate": 0.1, "loss": 2.635230779647827, "step": 1208 }, { "epoch": 0.03841269841269841, "grad_norm": 0.375, "learning_rate": 0.1, "loss": 2.642791986465454, "step": 1210 }, { "epoch": 0.038476190476190476, "grad_norm": 0.345703125, "learning_rate": 0.1, "loss": 2.6515157222747803, "step": 1212 }, { "epoch": 0.03853968253968254, "grad_norm": 0.337890625, "learning_rate": 0.1, "loss": 2.663710117340088, "step": 1214 }, { "epoch": 0.038603174603174605, "grad_norm": 0.416015625, "learning_rate": 0.1, "loss": 2.6322927474975586, "step": 1216 }, { "epoch": 0.03866666666666667, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.630251407623291, "step": 1218 }, { "epoch": 0.03873015873015873, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.6605069637298584, "step": 1220 }, { "epoch": 0.0387936507936508, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.655956268310547, "step": 1222 }, { "epoch": 0.038857142857142854, "grad_norm": 0.12255859375, "learning_rate": 0.1, "loss": 2.6126139163970947, "step": 1224 }, { "epoch": 0.03892063492063492, "grad_norm": 0.357421875, "learning_rate": 0.1, "loss": 2.6546597480773926, "step": 1226 }, { "epoch": 0.03898412698412698, "grad_norm": 0.5625, "learning_rate": 0.1, "loss": 2.638777732849121, "step": 1228 }, { "epoch": 0.039047619047619046, "grad_norm": 0.61328125, "learning_rate": 0.1, "loss": 2.637145519256592, "step": 1230 }, { "epoch": 0.03911111111111111, "grad_norm": 0.58984375, "learning_rate": 0.1, "loss": 2.6693761348724365, "step": 1232 }, { "epoch": 0.039174603174603174, "grad_norm": 0.3359375, "learning_rate": 0.1, "loss": 2.658592462539673, "step": 1234 }, { "epoch": 0.03923809523809524, "grad_norm": 0.3359375, "learning_rate": 0.1, "loss": 2.6623897552490234, "step": 1236 }, { "epoch": 0.0393015873015873, "grad_norm": 0.40234375, "learning_rate": 0.1, "loss": 2.647751808166504, "step": 1238 }, { "epoch": 0.03936507936507937, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.641871452331543, "step": 1240 }, { "epoch": 0.03942857142857143, "grad_norm": 0.08349609375, "learning_rate": 0.1, "loss": 2.6236391067504883, "step": 1242 }, { "epoch": 0.039492063492063495, "grad_norm": 0.11083984375, "learning_rate": 0.1, "loss": 2.641286611557007, "step": 1244 }, { "epoch": 0.03955555555555555, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.6621077060699463, "step": 1246 }, { "epoch": 0.039619047619047616, "grad_norm": 0.259765625, "learning_rate": 0.1, "loss": 2.6618478298187256, "step": 1248 }, { "epoch": 0.03968253968253968, "grad_norm": 0.314453125, "learning_rate": 0.1, "loss": 2.642731189727783, "step": 1250 }, { "epoch": 0.039746031746031744, "grad_norm": 0.474609375, "learning_rate": 0.1, "loss": 2.651308536529541, "step": 1252 }, { "epoch": 0.03980952380952381, "grad_norm": 0.341796875, "learning_rate": 0.1, "loss": 2.629499912261963, "step": 1254 }, { "epoch": 0.03987301587301587, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.6663670539855957, "step": 1256 }, { "epoch": 0.03993650793650794, "grad_norm": 0.232421875, "learning_rate": 0.1, "loss": 2.679774522781372, "step": 1258 }, { "epoch": 0.04, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.6271841526031494, "step": 1260 }, { "epoch": 0.040063492063492065, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.67289400100708, "step": 1262 }, { "epoch": 0.04012698412698413, "grad_norm": 0.302734375, "learning_rate": 0.1, "loss": 2.667947769165039, "step": 1264 }, { "epoch": 0.04019047619047619, "grad_norm": 0.357421875, "learning_rate": 0.1, "loss": 2.6563620567321777, "step": 1266 }, { "epoch": 0.04025396825396826, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.6362013816833496, "step": 1268 }, { "epoch": 0.040317460317460314, "grad_norm": 0.1142578125, "learning_rate": 0.1, "loss": 2.6524970531463623, "step": 1270 }, { "epoch": 0.04038095238095238, "grad_norm": 0.1416015625, "learning_rate": 0.1, "loss": 2.663745880126953, "step": 1272 }, { "epoch": 0.04044444444444444, "grad_norm": 0.302734375, "learning_rate": 0.1, "loss": 2.67185115814209, "step": 1274 }, { "epoch": 0.04050793650793651, "grad_norm": 0.41015625, "learning_rate": 0.1, "loss": 2.671518325805664, "step": 1276 }, { "epoch": 0.04057142857142857, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.6619322299957275, "step": 1278 }, { "epoch": 0.040634920634920635, "grad_norm": 0.26171875, "learning_rate": 0.1, "loss": 2.68058180809021, "step": 1280 }, { "epoch": 0.0406984126984127, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.633430242538452, "step": 1282 }, { "epoch": 0.04076190476190476, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.699065923690796, "step": 1284 }, { "epoch": 0.04082539682539683, "grad_norm": 0.4140625, "learning_rate": 0.1, "loss": 2.6409966945648193, "step": 1286 }, { "epoch": 0.04088888888888889, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.6681580543518066, "step": 1288 }, { "epoch": 0.040952380952380955, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.650306463241577, "step": 1290 }, { "epoch": 0.04101587301587301, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.647355318069458, "step": 1292 }, { "epoch": 0.04107936507936508, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.6431291103363037, "step": 1294 }, { "epoch": 0.04114285714285714, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.6549479961395264, "step": 1296 }, { "epoch": 0.041206349206349205, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.664788246154785, "step": 1298 }, { "epoch": 0.04126984126984127, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.6350059509277344, "step": 1300 }, { "epoch": 0.04133333333333333, "grad_norm": 0.1025390625, "learning_rate": 0.1, "loss": 2.633406162261963, "step": 1302 }, { "epoch": 0.0413968253968254, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.6360270977020264, "step": 1304 }, { "epoch": 0.04146031746031746, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.6345574855804443, "step": 1306 }, { "epoch": 0.041523809523809525, "grad_norm": 0.3671875, "learning_rate": 0.1, "loss": 2.6265199184417725, "step": 1308 }, { "epoch": 0.04158730158730159, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.623105764389038, "step": 1310 }, { "epoch": 0.041650793650793654, "grad_norm": 0.337890625, "learning_rate": 0.1, "loss": 2.623866319656372, "step": 1312 }, { "epoch": 0.04171428571428572, "grad_norm": 0.44921875, "learning_rate": 0.1, "loss": 2.6534759998321533, "step": 1314 }, { "epoch": 0.041777777777777775, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.657745599746704, "step": 1316 }, { "epoch": 0.04184126984126984, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.6699578762054443, "step": 1318 }, { "epoch": 0.0419047619047619, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.6243934631347656, "step": 1320 }, { "epoch": 0.04196825396825397, "grad_norm": 0.1376953125, "learning_rate": 0.1, "loss": 2.6332671642303467, "step": 1322 }, { "epoch": 0.04203174603174603, "grad_norm": 0.2451171875, "learning_rate": 0.1, "loss": 2.6694743633270264, "step": 1324 }, { "epoch": 0.042095238095238095, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.649601936340332, "step": 1326 }, { "epoch": 0.04215873015873016, "grad_norm": 0.103515625, "learning_rate": 0.1, "loss": 2.6271846294403076, "step": 1328 }, { "epoch": 0.042222222222222223, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.618587017059326, "step": 1330 }, { "epoch": 0.04228571428571429, "grad_norm": 0.30078125, "learning_rate": 0.1, "loss": 2.655590295791626, "step": 1332 }, { "epoch": 0.04234920634920635, "grad_norm": 0.373046875, "learning_rate": 0.1, "loss": 2.634373188018799, "step": 1334 }, { "epoch": 0.042412698412698416, "grad_norm": 0.35546875, "learning_rate": 0.1, "loss": 2.633297920227051, "step": 1336 }, { "epoch": 0.04247619047619047, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.6059844493865967, "step": 1338 }, { "epoch": 0.04253968253968254, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.6066677570343018, "step": 1340 }, { "epoch": 0.0426031746031746, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.6271371841430664, "step": 1342 }, { "epoch": 0.042666666666666665, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.6134698390960693, "step": 1344 }, { "epoch": 0.04273015873015873, "grad_norm": 0.2431640625, "learning_rate": 0.1, "loss": 2.637251615524292, "step": 1346 }, { "epoch": 0.04279365079365079, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.621696949005127, "step": 1348 }, { "epoch": 0.04285714285714286, "grad_norm": 0.1220703125, "learning_rate": 0.1, "loss": 2.6532161235809326, "step": 1350 }, { "epoch": 0.04292063492063492, "grad_norm": 0.390625, "learning_rate": 0.1, "loss": 2.6323459148406982, "step": 1352 }, { "epoch": 0.042984126984126986, "grad_norm": 0.306640625, "learning_rate": 0.1, "loss": 2.593425989151001, "step": 1354 }, { "epoch": 0.04304761904761905, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.6211507320404053, "step": 1356 }, { "epoch": 0.043111111111111114, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.605872392654419, "step": 1358 }, { "epoch": 0.04317460317460318, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.594703435897827, "step": 1360 }, { "epoch": 0.043238095238095235, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.6084237098693848, "step": 1362 }, { "epoch": 0.0433015873015873, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.5925495624542236, "step": 1364 }, { "epoch": 0.04336507936507936, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.607645273208618, "step": 1366 }, { "epoch": 0.04342857142857143, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.6350653171539307, "step": 1368 }, { "epoch": 0.04349206349206349, "grad_norm": 0.30078125, "learning_rate": 0.1, "loss": 2.615772247314453, "step": 1370 }, { "epoch": 0.043555555555555556, "grad_norm": 0.306640625, "learning_rate": 0.1, "loss": 2.6255178451538086, "step": 1372 }, { "epoch": 0.04361904761904762, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.621572971343994, "step": 1374 }, { "epoch": 0.043682539682539684, "grad_norm": 0.3359375, "learning_rate": 0.1, "loss": 2.6190061569213867, "step": 1376 }, { "epoch": 0.04374603174603175, "grad_norm": 0.4296875, "learning_rate": 0.1, "loss": 2.6373236179351807, "step": 1378 }, { "epoch": 0.04380952380952381, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.617990493774414, "step": 1380 }, { "epoch": 0.043873015873015876, "grad_norm": 0.498046875, "learning_rate": 0.1, "loss": 2.598693609237671, "step": 1382 }, { "epoch": 0.04393650793650793, "grad_norm": 0.41796875, "learning_rate": 0.1, "loss": 2.61950945854187, "step": 1384 }, { "epoch": 0.044, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.5696895122528076, "step": 1386 }, { "epoch": 0.04406349206349206, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.6181018352508545, "step": 1388 }, { "epoch": 0.044126984126984126, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.6094446182250977, "step": 1390 }, { "epoch": 0.04419047619047619, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.6038010120391846, "step": 1392 }, { "epoch": 0.044253968253968254, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.5856547355651855, "step": 1394 }, { "epoch": 0.04431746031746032, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.6262505054473877, "step": 1396 }, { "epoch": 0.04438095238095238, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.610076427459717, "step": 1398 }, { "epoch": 0.044444444444444446, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.6102683544158936, "step": 1400 }, { "epoch": 0.04450793650793651, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.6243345737457275, "step": 1402 }, { "epoch": 0.044571428571428574, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.6047377586364746, "step": 1404 }, { "epoch": 0.04463492063492063, "grad_norm": 0.3359375, "learning_rate": 0.1, "loss": 2.6103434562683105, "step": 1406 }, { "epoch": 0.044698412698412696, "grad_norm": 0.4140625, "learning_rate": 0.1, "loss": 2.6175785064697266, "step": 1408 }, { "epoch": 0.04476190476190476, "grad_norm": 0.1181640625, "learning_rate": 0.1, "loss": 2.5673089027404785, "step": 1410 }, { "epoch": 0.044825396825396824, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.6276071071624756, "step": 1412 }, { "epoch": 0.04488888888888889, "grad_norm": 0.361328125, "learning_rate": 0.1, "loss": 2.605919599533081, "step": 1414 }, { "epoch": 0.04495238095238095, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.5857460498809814, "step": 1416 }, { "epoch": 0.045015873015873016, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.5984723567962646, "step": 1418 }, { "epoch": 0.04507936507936508, "grad_norm": 0.490234375, "learning_rate": 0.1, "loss": 2.6093602180480957, "step": 1420 }, { "epoch": 0.045142857142857144, "grad_norm": 0.3671875, "learning_rate": 0.1, "loss": 2.585080146789551, "step": 1422 }, { "epoch": 0.04520634920634921, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.5962133407592773, "step": 1424 }, { "epoch": 0.04526984126984127, "grad_norm": 0.232421875, "learning_rate": 0.1, "loss": 2.595618486404419, "step": 1426 }, { "epoch": 0.04533333333333334, "grad_norm": 0.3515625, "learning_rate": 0.1, "loss": 2.585345506668091, "step": 1428 }, { "epoch": 0.045396825396825394, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.574321985244751, "step": 1430 }, { "epoch": 0.04546031746031746, "grad_norm": 0.2275390625, "learning_rate": 0.1, "loss": 2.619889974594116, "step": 1432 }, { "epoch": 0.04552380952380952, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.5779075622558594, "step": 1434 }, { "epoch": 0.045587301587301586, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.592456817626953, "step": 1436 }, { "epoch": 0.04565079365079365, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.5951809883117676, "step": 1438 }, { "epoch": 0.045714285714285714, "grad_norm": 0.23828125, "learning_rate": 0.1, "loss": 2.5742359161376953, "step": 1440 }, { "epoch": 0.04577777777777778, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.5701749324798584, "step": 1442 }, { "epoch": 0.04584126984126984, "grad_norm": 0.2392578125, "learning_rate": 0.1, "loss": 2.5904343128204346, "step": 1444 }, { "epoch": 0.04590476190476191, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.5817737579345703, "step": 1446 }, { "epoch": 0.04596825396825397, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.593200206756592, "step": 1448 }, { "epoch": 0.046031746031746035, "grad_norm": 0.390625, "learning_rate": 0.1, "loss": 2.5693047046661377, "step": 1450 }, { "epoch": 0.04609523809523809, "grad_norm": 0.453125, "learning_rate": 0.1, "loss": 2.6022086143493652, "step": 1452 }, { "epoch": 0.046158730158730156, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.5931637287139893, "step": 1454 }, { "epoch": 0.04622222222222222, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.5610291957855225, "step": 1456 }, { "epoch": 0.046285714285714284, "grad_norm": 0.3046875, "learning_rate": 0.1, "loss": 2.593970775604248, "step": 1458 }, { "epoch": 0.04634920634920635, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.6248795986175537, "step": 1460 }, { "epoch": 0.04641269841269841, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.575120210647583, "step": 1462 }, { "epoch": 0.046476190476190476, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.572310209274292, "step": 1464 }, { "epoch": 0.04653968253968254, "grad_norm": 0.38671875, "learning_rate": 0.1, "loss": 2.5621745586395264, "step": 1466 }, { "epoch": 0.046603174603174605, "grad_norm": 0.2451171875, "learning_rate": 0.1, "loss": 2.5607662200927734, "step": 1468 }, { "epoch": 0.04666666666666667, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.563286304473877, "step": 1470 }, { "epoch": 0.04673015873015873, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.6046438217163086, "step": 1472 }, { "epoch": 0.0467936507936508, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.614341974258423, "step": 1474 }, { "epoch": 0.046857142857142854, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.5632169246673584, "step": 1476 }, { "epoch": 0.04692063492063492, "grad_norm": 0.11669921875, "learning_rate": 0.1, "loss": 2.582859516143799, "step": 1478 }, { "epoch": 0.04698412698412698, "grad_norm": 0.208984375, "learning_rate": 0.1, "loss": 2.585080623626709, "step": 1480 }, { "epoch": 0.047047619047619046, "grad_norm": 0.25, "learning_rate": 0.1, "loss": 2.5611844062805176, "step": 1482 }, { "epoch": 0.04711111111111111, "grad_norm": 0.4140625, "learning_rate": 0.1, "loss": 2.5972559452056885, "step": 1484 }, { "epoch": 0.047174603174603175, "grad_norm": 0.2373046875, "learning_rate": 0.1, "loss": 2.5783493518829346, "step": 1486 }, { "epoch": 0.04723809523809524, "grad_norm": 0.2060546875, "learning_rate": 0.1, "loss": 2.579458475112915, "step": 1488 }, { "epoch": 0.0473015873015873, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.5805931091308594, "step": 1490 }, { "epoch": 0.04736507936507937, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.5763728618621826, "step": 1492 }, { "epoch": 0.04742857142857143, "grad_norm": 0.10400390625, "learning_rate": 0.1, "loss": 2.5762224197387695, "step": 1494 }, { "epoch": 0.047492063492063495, "grad_norm": 0.11572265625, "learning_rate": 0.1, "loss": 2.5989184379577637, "step": 1496 }, { "epoch": 0.04755555555555555, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.585195779800415, "step": 1498 }, { "epoch": 0.047619047619047616, "grad_norm": 0.5390625, "learning_rate": 0.1, "loss": 2.564549684524536, "step": 1500 }, { "epoch": 0.04768253968253968, "grad_norm": 0.578125, "learning_rate": 0.1, "loss": 2.5761020183563232, "step": 1502 }, { "epoch": 0.047746031746031745, "grad_norm": 0.10009765625, "learning_rate": 0.1, "loss": 2.5476179122924805, "step": 1504 }, { "epoch": 0.04780952380952381, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.599600076675415, "step": 1506 }, { "epoch": 0.04787301587301587, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.567418336868286, "step": 1508 }, { "epoch": 0.04793650793650794, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.584707021713257, "step": 1510 }, { "epoch": 0.048, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.5879013538360596, "step": 1512 }, { "epoch": 0.048063492063492065, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.566744565963745, "step": 1514 }, { "epoch": 0.04812698412698413, "grad_norm": 0.11767578125, "learning_rate": 0.1, "loss": 2.5640485286712646, "step": 1516 }, { "epoch": 0.04819047619047619, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.570289134979248, "step": 1518 }, { "epoch": 0.04825396825396826, "grad_norm": 0.51953125, "learning_rate": 0.1, "loss": 2.595640182495117, "step": 1520 }, { "epoch": 0.048317460317460315, "grad_norm": 0.419921875, "learning_rate": 0.1, "loss": 2.567457914352417, "step": 1522 }, { "epoch": 0.04838095238095238, "grad_norm": 0.4921875, "learning_rate": 0.1, "loss": 2.581139326095581, "step": 1524 }, { "epoch": 0.04844444444444444, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.572601079940796, "step": 1526 }, { "epoch": 0.04850793650793651, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.558884620666504, "step": 1528 }, { "epoch": 0.04857142857142857, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.5437374114990234, "step": 1530 }, { "epoch": 0.048634920634920635, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.577623128890991, "step": 1532 }, { "epoch": 0.0486984126984127, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.564507484436035, "step": 1534 }, { "epoch": 0.04876190476190476, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.5699408054351807, "step": 1536 }, { "epoch": 0.04882539682539683, "grad_norm": 0.228515625, "learning_rate": 0.1, "loss": 2.55897855758667, "step": 1538 }, { "epoch": 0.04888888888888889, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.5800466537475586, "step": 1540 }, { "epoch": 0.048952380952380956, "grad_norm": 0.1025390625, "learning_rate": 0.1, "loss": 2.548694610595703, "step": 1542 }, { "epoch": 0.04901587301587301, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.5570762157440186, "step": 1544 }, { "epoch": 0.04907936507936508, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.5677740573883057, "step": 1546 }, { "epoch": 0.04914285714285714, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.5769152641296387, "step": 1548 }, { "epoch": 0.049206349206349205, "grad_norm": 0.328125, "learning_rate": 0.1, "loss": 2.554226875305176, "step": 1550 }, { "epoch": 0.04926984126984127, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.578450918197632, "step": 1552 }, { "epoch": 0.04933333333333333, "grad_norm": 0.08544921875, "learning_rate": 0.1, "loss": 2.55303692817688, "step": 1554 }, { "epoch": 0.0493968253968254, "grad_norm": 0.08837890625, "learning_rate": 0.1, "loss": 2.560556411743164, "step": 1556 }, { "epoch": 0.04946031746031746, "grad_norm": 0.31640625, "learning_rate": 0.1, "loss": 2.569547414779663, "step": 1558 }, { "epoch": 0.049523809523809526, "grad_norm": 0.314453125, "learning_rate": 0.1, "loss": 2.554215908050537, "step": 1560 }, { "epoch": 0.04958730158730159, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.5629169940948486, "step": 1562 }, { "epoch": 0.049650793650793654, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.5520761013031006, "step": 1564 }, { "epoch": 0.04971428571428571, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.5472068786621094, "step": 1566 }, { "epoch": 0.049777777777777775, "grad_norm": 0.3671875, "learning_rate": 0.1, "loss": 2.5570318698883057, "step": 1568 }, { "epoch": 0.04984126984126984, "grad_norm": 0.29296875, "learning_rate": 0.1, "loss": 2.578866958618164, "step": 1570 }, { "epoch": 0.0499047619047619, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.54788875579834, "step": 1572 }, { "epoch": 0.04996825396825397, "grad_norm": 0.322265625, "learning_rate": 0.1, "loss": 2.549659490585327, "step": 1574 }, { "epoch": 0.05003174603174603, "grad_norm": 0.37890625, "learning_rate": 0.1, "loss": 2.5682427883148193, "step": 1576 }, { "epoch": 0.050095238095238095, "grad_norm": 0.365234375, "learning_rate": 0.1, "loss": 2.550196647644043, "step": 1578 }, { "epoch": 0.05015873015873016, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.5718095302581787, "step": 1580 }, { "epoch": 0.050222222222222224, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.5673797130584717, "step": 1582 }, { "epoch": 0.05028571428571429, "grad_norm": 0.09619140625, "learning_rate": 0.1, "loss": 2.5521466732025146, "step": 1584 }, { "epoch": 0.05034920634920635, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.577080488204956, "step": 1586 }, { "epoch": 0.050412698412698416, "grad_norm": 0.341796875, "learning_rate": 0.1, "loss": 2.540980339050293, "step": 1588 }, { "epoch": 0.05047619047619047, "grad_norm": 0.4375, "learning_rate": 0.1, "loss": 2.563969612121582, "step": 1590 }, { "epoch": 0.05053968253968254, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.538367986679077, "step": 1592 }, { "epoch": 0.0506031746031746, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.5723178386688232, "step": 1594 }, { "epoch": 0.050666666666666665, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.542020320892334, "step": 1596 }, { "epoch": 0.05073015873015873, "grad_norm": 0.10595703125, "learning_rate": 0.1, "loss": 2.55751371383667, "step": 1598 }, { "epoch": 0.050793650793650794, "grad_norm": 0.234375, "learning_rate": 0.1, "loss": 2.5805773735046387, "step": 1600 }, { "epoch": 0.05085714285714286, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.5389583110809326, "step": 1602 }, { "epoch": 0.05092063492063492, "grad_norm": 0.310546875, "learning_rate": 0.1, "loss": 2.5776243209838867, "step": 1604 }, { "epoch": 0.050984126984126986, "grad_norm": 0.388671875, "learning_rate": 0.1, "loss": 2.566572904586792, "step": 1606 }, { "epoch": 0.05104761904761905, "grad_norm": 0.22265625, "learning_rate": 0.1, "loss": 2.557096481323242, "step": 1608 }, { "epoch": 0.051111111111111114, "grad_norm": 0.1015625, "learning_rate": 0.1, "loss": 2.587430477142334, "step": 1610 }, { "epoch": 0.05117460317460317, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.559907913208008, "step": 1612 }, { "epoch": 0.051238095238095235, "grad_norm": 0.349609375, "learning_rate": 0.1, "loss": 2.587547540664673, "step": 1614 }, { "epoch": 0.0513015873015873, "grad_norm": 0.47265625, "learning_rate": 0.1, "loss": 2.547322988510132, "step": 1616 }, { "epoch": 0.051365079365079364, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.5688283443450928, "step": 1618 }, { "epoch": 0.05142857142857143, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.5672173500061035, "step": 1620 }, { "epoch": 0.05149206349206349, "grad_norm": 0.296875, "learning_rate": 0.1, "loss": 2.6108238697052, "step": 1622 }, { "epoch": 0.051555555555555556, "grad_norm": 0.38671875, "learning_rate": 0.1, "loss": 2.5794496536254883, "step": 1624 }, { "epoch": 0.05161904761904762, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.5545010566711426, "step": 1626 }, { "epoch": 0.051682539682539684, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.5686938762664795, "step": 1628 }, { "epoch": 0.05174603174603175, "grad_norm": 0.10009765625, "learning_rate": 0.1, "loss": 2.557025909423828, "step": 1630 }, { "epoch": 0.05180952380952381, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.565890312194824, "step": 1632 }, { "epoch": 0.051873015873015876, "grad_norm": 0.1103515625, "learning_rate": 0.1, "loss": 2.540130615234375, "step": 1634 }, { "epoch": 0.051936507936507934, "grad_norm": 0.1376953125, "learning_rate": 0.1, "loss": 2.5586814880371094, "step": 1636 }, { "epoch": 0.052, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.537118673324585, "step": 1638 }, { "epoch": 0.05206349206349206, "grad_norm": 0.23046875, "learning_rate": 0.1, "loss": 2.568425178527832, "step": 1640 }, { "epoch": 0.052126984126984126, "grad_norm": 0.2314453125, "learning_rate": 0.1, "loss": 2.5794079303741455, "step": 1642 }, { "epoch": 0.05219047619047619, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.562795639038086, "step": 1644 }, { "epoch": 0.052253968253968254, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.5527076721191406, "step": 1646 }, { "epoch": 0.05231746031746032, "grad_norm": 0.203125, "learning_rate": 0.1, "loss": 2.5774025917053223, "step": 1648 }, { "epoch": 0.05238095238095238, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.556128978729248, "step": 1650 }, { "epoch": 0.052444444444444446, "grad_norm": 0.66796875, "learning_rate": 0.1, "loss": 2.5720303058624268, "step": 1652 }, { "epoch": 0.05250793650793651, "grad_norm": 0.390625, "learning_rate": 0.1, "loss": 2.5731544494628906, "step": 1654 }, { "epoch": 0.052571428571428575, "grad_norm": 0.244140625, "learning_rate": 0.1, "loss": 2.5455148220062256, "step": 1656 }, { "epoch": 0.05263492063492063, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.5560178756713867, "step": 1658 }, { "epoch": 0.052698412698412696, "grad_norm": 0.251953125, "learning_rate": 0.1, "loss": 2.5808043479919434, "step": 1660 }, { "epoch": 0.05276190476190476, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.566513776779175, "step": 1662 }, { "epoch": 0.052825396825396824, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.5495171546936035, "step": 1664 }, { "epoch": 0.05288888888888889, "grad_norm": 0.09375, "learning_rate": 0.1, "loss": 2.559464693069458, "step": 1666 }, { "epoch": 0.05295238095238095, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.543877363204956, "step": 1668 }, { "epoch": 0.053015873015873016, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.5371670722961426, "step": 1670 }, { "epoch": 0.05307936507936508, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.527984380722046, "step": 1672 }, { "epoch": 0.053142857142857144, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.579608678817749, "step": 1674 }, { "epoch": 0.05320634920634921, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.548367977142334, "step": 1676 }, { "epoch": 0.05326984126984127, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.5248935222625732, "step": 1678 }, { "epoch": 0.05333333333333334, "grad_norm": 0.1943359375, "learning_rate": 0.1, "loss": 2.570230722427368, "step": 1680 }, { "epoch": 0.053396825396825394, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.5938262939453125, "step": 1682 }, { "epoch": 0.05346031746031746, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.5727975368499756, "step": 1684 }, { "epoch": 0.05352380952380952, "grad_norm": 0.435546875, "learning_rate": 0.1, "loss": 2.560960292816162, "step": 1686 }, { "epoch": 0.053587301587301586, "grad_norm": 0.60546875, "learning_rate": 0.1, "loss": 2.554786443710327, "step": 1688 }, { "epoch": 0.05365079365079365, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.5606701374053955, "step": 1690 }, { "epoch": 0.053714285714285714, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.59409499168396, "step": 1692 }, { "epoch": 0.05377777777777778, "grad_norm": 0.322265625, "learning_rate": 0.1, "loss": 2.553870439529419, "step": 1694 }, { "epoch": 0.05384126984126984, "grad_norm": 0.419921875, "learning_rate": 0.1, "loss": 2.5651254653930664, "step": 1696 }, { "epoch": 0.05390476190476191, "grad_norm": 0.3828125, "learning_rate": 0.1, "loss": 2.5733139514923096, "step": 1698 }, { "epoch": 0.05396825396825397, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.5474166870117188, "step": 1700 }, { "epoch": 0.054031746031746035, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.550259828567505, "step": 1702 }, { "epoch": 0.05409523809523809, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.568990707397461, "step": 1704 }, { "epoch": 0.054158730158730156, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.5838751792907715, "step": 1706 }, { "epoch": 0.05422222222222222, "grad_norm": 0.1943359375, "learning_rate": 0.1, "loss": 2.596839666366577, "step": 1708 }, { "epoch": 0.054285714285714284, "grad_norm": 0.12255859375, "learning_rate": 0.1, "loss": 2.581559658050537, "step": 1710 }, { "epoch": 0.05434920634920635, "grad_norm": 0.10986328125, "learning_rate": 0.1, "loss": 2.5485291481018066, "step": 1712 }, { "epoch": 0.05441269841269841, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.563433885574341, "step": 1714 }, { "epoch": 0.05447619047619048, "grad_norm": 0.24609375, "learning_rate": 0.1, "loss": 2.5516622066497803, "step": 1716 }, { "epoch": 0.05453968253968254, "grad_norm": 0.369140625, "learning_rate": 0.1, "loss": 2.5698862075805664, "step": 1718 }, { "epoch": 0.054603174603174605, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.5475826263427734, "step": 1720 }, { "epoch": 0.05466666666666667, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.5463545322418213, "step": 1722 }, { "epoch": 0.05473015873015873, "grad_norm": 0.25, "learning_rate": 0.1, "loss": 2.575303554534912, "step": 1724 }, { "epoch": 0.05479365079365079, "grad_norm": 0.37109375, "learning_rate": 0.1, "loss": 2.581329822540283, "step": 1726 }, { "epoch": 0.054857142857142854, "grad_norm": 0.228515625, "learning_rate": 0.1, "loss": 2.5549137592315674, "step": 1728 }, { "epoch": 0.05492063492063492, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.574509859085083, "step": 1730 }, { "epoch": 0.05498412698412698, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.555715799331665, "step": 1732 }, { "epoch": 0.05504761904761905, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.5694868564605713, "step": 1734 }, { "epoch": 0.05511111111111111, "grad_norm": 0.310546875, "learning_rate": 0.1, "loss": 2.5594654083251953, "step": 1736 }, { "epoch": 0.055174603174603175, "grad_norm": 0.298828125, "learning_rate": 0.1, "loss": 2.552854299545288, "step": 1738 }, { "epoch": 0.05523809523809524, "grad_norm": 0.384765625, "learning_rate": 0.1, "loss": 2.5697147846221924, "step": 1740 }, { "epoch": 0.0553015873015873, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.5553224086761475, "step": 1742 }, { "epoch": 0.05536507936507937, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.551034450531006, "step": 1744 }, { "epoch": 0.05542857142857143, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.571258544921875, "step": 1746 }, { "epoch": 0.055492063492063495, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.5208468437194824, "step": 1748 }, { "epoch": 0.05555555555555555, "grad_norm": 0.490234375, "learning_rate": 0.1, "loss": 2.5568060874938965, "step": 1750 }, { "epoch": 0.05561904761904762, "grad_norm": 0.53515625, "learning_rate": 0.1, "loss": 2.546286106109619, "step": 1752 }, { "epoch": 0.05568253968253968, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.5608971118927, "step": 1754 }, { "epoch": 0.055746031746031745, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.54545259475708, "step": 1756 }, { "epoch": 0.05580952380952381, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.548344612121582, "step": 1758 }, { "epoch": 0.05587301587301587, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.5470998287200928, "step": 1760 }, { "epoch": 0.05593650793650794, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.5637171268463135, "step": 1762 }, { "epoch": 0.056, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.5491206645965576, "step": 1764 }, { "epoch": 0.056063492063492065, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.5493226051330566, "step": 1766 }, { "epoch": 0.05612698412698413, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.5420916080474854, "step": 1768 }, { "epoch": 0.05619047619047619, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.510344982147217, "step": 1770 }, { "epoch": 0.05625396825396825, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.555835723876953, "step": 1772 }, { "epoch": 0.056317460317460315, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.554429292678833, "step": 1774 }, { "epoch": 0.05638095238095238, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.529087781906128, "step": 1776 }, { "epoch": 0.05644444444444444, "grad_norm": 0.5078125, "learning_rate": 0.1, "loss": 2.5441064834594727, "step": 1778 }, { "epoch": 0.05650793650793651, "grad_norm": 0.29296875, "learning_rate": 0.1, "loss": 2.541973352432251, "step": 1780 }, { "epoch": 0.05657142857142857, "grad_norm": 0.1328125, "learning_rate": 0.1, "loss": 2.543414354324341, "step": 1782 }, { "epoch": 0.056634920634920635, "grad_norm": 0.244140625, "learning_rate": 0.1, "loss": 2.518357276916504, "step": 1784 }, { "epoch": 0.0566984126984127, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.529357433319092, "step": 1786 }, { "epoch": 0.05676190476190476, "grad_norm": 0.1328125, "learning_rate": 0.1, "loss": 2.5181009769439697, "step": 1788 }, { "epoch": 0.05682539682539683, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.5359349250793457, "step": 1790 }, { "epoch": 0.05688888888888889, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.544494390487671, "step": 1792 }, { "epoch": 0.056952380952380956, "grad_norm": 0.37890625, "learning_rate": 0.1, "loss": 2.499652862548828, "step": 1794 }, { "epoch": 0.05701587301587301, "grad_norm": 0.48828125, "learning_rate": 0.1, "loss": 2.551034927368164, "step": 1796 }, { "epoch": 0.05707936507936508, "grad_norm": 0.1103515625, "learning_rate": 0.1, "loss": 2.5485610961914062, "step": 1798 }, { "epoch": 0.05714285714285714, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.5431559085845947, "step": 1800 }, { "epoch": 0.057206349206349205, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.5219719409942627, "step": 1802 }, { "epoch": 0.05726984126984127, "grad_norm": 0.10888671875, "learning_rate": 0.1, "loss": 2.518449068069458, "step": 1804 }, { "epoch": 0.05733333333333333, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.5470104217529297, "step": 1806 }, { "epoch": 0.0573968253968254, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.5278737545013428, "step": 1808 }, { "epoch": 0.05746031746031746, "grad_norm": 0.341796875, "learning_rate": 0.1, "loss": 2.5266966819763184, "step": 1810 }, { "epoch": 0.057523809523809526, "grad_norm": 0.2294921875, "learning_rate": 0.1, "loss": 2.537440061569214, "step": 1812 }, { "epoch": 0.05758730158730159, "grad_norm": 0.11376953125, "learning_rate": 0.1, "loss": 2.503722667694092, "step": 1814 }, { "epoch": 0.057650793650793654, "grad_norm": 0.408203125, "learning_rate": 0.1, "loss": 2.5272412300109863, "step": 1816 }, { "epoch": 0.05771428571428571, "grad_norm": 0.44921875, "learning_rate": 0.1, "loss": 2.524090051651001, "step": 1818 }, { "epoch": 0.057777777777777775, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.513633966445923, "step": 1820 }, { "epoch": 0.05784126984126984, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.537795066833496, "step": 1822 }, { "epoch": 0.0579047619047619, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.524052858352661, "step": 1824 }, { "epoch": 0.05796825396825397, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.5261220932006836, "step": 1826 }, { "epoch": 0.05803174603174603, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.5503945350646973, "step": 1828 }, { "epoch": 0.058095238095238096, "grad_norm": 0.2734375, "learning_rate": 0.1, "loss": 2.5280978679656982, "step": 1830 }, { "epoch": 0.05815873015873016, "grad_norm": 0.333984375, "learning_rate": 0.1, "loss": 2.5436899662017822, "step": 1832 }, { "epoch": 0.058222222222222224, "grad_norm": 0.34765625, "learning_rate": 0.1, "loss": 2.5031402111053467, "step": 1834 }, { "epoch": 0.05828571428571429, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.5325429439544678, "step": 1836 }, { "epoch": 0.05834920634920635, "grad_norm": 0.1142578125, "learning_rate": 0.1, "loss": 2.537998676300049, "step": 1838 }, { "epoch": 0.058412698412698416, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.4975340366363525, "step": 1840 }, { "epoch": 0.05847619047619047, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.484893321990967, "step": 1842 }, { "epoch": 0.05853968253968254, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.485283136367798, "step": 1844 }, { "epoch": 0.0586031746031746, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.5221755504608154, "step": 1846 }, { "epoch": 0.058666666666666666, "grad_norm": 0.07177734375, "learning_rate": 0.1, "loss": 2.5147149562835693, "step": 1848 }, { "epoch": 0.05873015873015873, "grad_norm": 0.12255859375, "learning_rate": 0.1, "loss": 2.495335578918457, "step": 1850 }, { "epoch": 0.058793650793650794, "grad_norm": 0.21484375, "learning_rate": 0.1, "loss": 2.5122292041778564, "step": 1852 }, { "epoch": 0.05885714285714286, "grad_norm": 0.40234375, "learning_rate": 0.1, "loss": 2.5133631229400635, "step": 1854 }, { "epoch": 0.05892063492063492, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.513652801513672, "step": 1856 }, { "epoch": 0.058984126984126986, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.5211727619171143, "step": 1858 }, { "epoch": 0.05904761904761905, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.524021863937378, "step": 1860 }, { "epoch": 0.059111111111111114, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.4971187114715576, "step": 1862 }, { "epoch": 0.05917460317460317, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.5113980770111084, "step": 1864 }, { "epoch": 0.059238095238095236, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.5380020141601562, "step": 1866 }, { "epoch": 0.0593015873015873, "grad_norm": 0.2255859375, "learning_rate": 0.1, "loss": 2.507148027420044, "step": 1868 }, { "epoch": 0.059365079365079364, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.504845380783081, "step": 1870 }, { "epoch": 0.05942857142857143, "grad_norm": 0.38671875, "learning_rate": 0.1, "loss": 2.5059828758239746, "step": 1872 }, { "epoch": 0.05949206349206349, "grad_norm": 0.341796875, "learning_rate": 0.1, "loss": 2.517310380935669, "step": 1874 }, { "epoch": 0.059555555555555556, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.5400493144989014, "step": 1876 }, { "epoch": 0.05961904761904762, "grad_norm": 0.10302734375, "learning_rate": 0.1, "loss": 2.5164201259613037, "step": 1878 }, { "epoch": 0.059682539682539684, "grad_norm": 0.09326171875, "learning_rate": 0.1, "loss": 2.5032596588134766, "step": 1880 }, { "epoch": 0.05974603174603175, "grad_norm": 0.2412109375, "learning_rate": 0.1, "loss": 2.5329995155334473, "step": 1882 }, { "epoch": 0.05980952380952381, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.5129592418670654, "step": 1884 }, { "epoch": 0.05987301587301587, "grad_norm": 0.095703125, "learning_rate": 0.1, "loss": 2.5000452995300293, "step": 1886 }, { "epoch": 0.059936507936507934, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 2.5494816303253174, "step": 1888 }, { "epoch": 0.06, "grad_norm": 0.51171875, "learning_rate": 0.1, "loss": 2.5271501541137695, "step": 1890 }, { "epoch": 0.06006349206349206, "grad_norm": 0.294921875, "learning_rate": 0.1, "loss": 2.5172698497772217, "step": 1892 }, { "epoch": 0.060126984126984126, "grad_norm": 0.08984375, "learning_rate": 0.1, "loss": 2.5025341510772705, "step": 1894 }, { "epoch": 0.06019047619047619, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.5273935794830322, "step": 1896 }, { "epoch": 0.060253968253968254, "grad_norm": 0.455078125, "learning_rate": 0.1, "loss": 2.5263943672180176, "step": 1898 }, { "epoch": 0.06031746031746032, "grad_norm": 0.2421875, "learning_rate": 0.1, "loss": 2.547755002975464, "step": 1900 }, { "epoch": 0.06038095238095238, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.529520034790039, "step": 1902 }, { "epoch": 0.060444444444444446, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.5250437259674072, "step": 1904 }, { "epoch": 0.06050793650793651, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.520601272583008, "step": 1906 }, { "epoch": 0.060571428571428575, "grad_norm": 0.11767578125, "learning_rate": 0.1, "loss": 2.541776657104492, "step": 1908 }, { "epoch": 0.06063492063492063, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.526111602783203, "step": 1910 }, { "epoch": 0.060698412698412696, "grad_norm": 0.1650390625, "learning_rate": 0.1, "loss": 2.534066915512085, "step": 1912 }, { "epoch": 0.06076190476190476, "grad_norm": 0.1201171875, "learning_rate": 0.1, "loss": 2.528672218322754, "step": 1914 }, { "epoch": 0.060825396825396824, "grad_norm": 0.0986328125, "learning_rate": 0.1, "loss": 2.518888235092163, "step": 1916 }, { "epoch": 0.06088888888888889, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.5354347229003906, "step": 1918 }, { "epoch": 0.06095238095238095, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.5207362174987793, "step": 1920 }, { "epoch": 0.061015873015873016, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.5410056114196777, "step": 1922 }, { "epoch": 0.06107936507936508, "grad_norm": 0.48046875, "learning_rate": 0.1, "loss": 2.545945405960083, "step": 1924 }, { "epoch": 0.061142857142857145, "grad_norm": 0.455078125, "learning_rate": 0.1, "loss": 2.541351795196533, "step": 1926 }, { "epoch": 0.06120634920634921, "grad_norm": 0.24609375, "learning_rate": 0.1, "loss": 2.5412797927856445, "step": 1928 }, { "epoch": 0.06126984126984127, "grad_norm": 0.1103515625, "learning_rate": 0.1, "loss": 2.5232605934143066, "step": 1930 }, { "epoch": 0.06133333333333333, "grad_norm": 0.2255859375, "learning_rate": 0.1, "loss": 2.5424904823303223, "step": 1932 }, { "epoch": 0.061396825396825394, "grad_norm": 0.2734375, "learning_rate": 0.1, "loss": 2.5219290256500244, "step": 1934 }, { "epoch": 0.06146031746031746, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.5239145755767822, "step": 1936 }, { "epoch": 0.06152380952380952, "grad_norm": 0.1240234375, "learning_rate": 0.1, "loss": 2.527787923812866, "step": 1938 }, { "epoch": 0.061587301587301586, "grad_norm": 0.1181640625, "learning_rate": 0.1, "loss": 2.4958066940307617, "step": 1940 }, { "epoch": 0.06165079365079365, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.5490989685058594, "step": 1942 }, { "epoch": 0.061714285714285715, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.502736806869507, "step": 1944 }, { "epoch": 0.06177777777777778, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.5124857425689697, "step": 1946 }, { "epoch": 0.06184126984126984, "grad_norm": 0.28125, "learning_rate": 0.1, "loss": 2.5259721279144287, "step": 1948 }, { "epoch": 0.06190476190476191, "grad_norm": 0.251953125, "learning_rate": 0.1, "loss": 2.5229554176330566, "step": 1950 }, { "epoch": 0.06196825396825397, "grad_norm": 0.10009765625, "learning_rate": 0.1, "loss": 2.498845100402832, "step": 1952 }, { "epoch": 0.062031746031746035, "grad_norm": 0.328125, "learning_rate": 0.1, "loss": 2.519442081451416, "step": 1954 }, { "epoch": 0.06209523809523809, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.5344386100769043, "step": 1956 }, { "epoch": 0.062158730158730156, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.510472536087036, "step": 1958 }, { "epoch": 0.06222222222222222, "grad_norm": 0.1015625, "learning_rate": 0.1, "loss": 2.512559652328491, "step": 1960 }, { "epoch": 0.062285714285714285, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.5448036193847656, "step": 1962 }, { "epoch": 0.06234920634920635, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.487271547317505, "step": 1964 }, { "epoch": 0.06241269841269841, "grad_norm": 0.28125, "learning_rate": 0.1, "loss": 2.50666880607605, "step": 1966 }, { "epoch": 0.06247619047619048, "grad_norm": 0.54296875, "learning_rate": 0.1, "loss": 2.550255060195923, "step": 1968 }, { "epoch": 0.06253968253968253, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.5325605869293213, "step": 1970 }, { "epoch": 0.0626031746031746, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.508082151412964, "step": 1972 }, { "epoch": 0.06266666666666666, "grad_norm": 0.22265625, "learning_rate": 0.1, "loss": 2.5173730850219727, "step": 1974 }, { "epoch": 0.06273015873015873, "grad_norm": 0.2216796875, "learning_rate": 0.1, "loss": 2.5099987983703613, "step": 1976 }, { "epoch": 0.06279365079365079, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.5198261737823486, "step": 1978 }, { "epoch": 0.06285714285714286, "grad_norm": 0.26171875, "learning_rate": 0.1, "loss": 2.512305974960327, "step": 1980 }, { "epoch": 0.06292063492063492, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.5367496013641357, "step": 1982 }, { "epoch": 0.06298412698412699, "grad_norm": 0.322265625, "learning_rate": 0.1, "loss": 2.517648220062256, "step": 1984 }, { "epoch": 0.06304761904761905, "grad_norm": 0.6015625, "learning_rate": 0.1, "loss": 2.53326153755188, "step": 1986 }, { "epoch": 0.06311111111111112, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.5516254901885986, "step": 1988 }, { "epoch": 0.06317460317460317, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.532486915588379, "step": 1990 }, { "epoch": 0.06323809523809523, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.5476768016815186, "step": 1992 }, { "epoch": 0.0633015873015873, "grad_norm": 0.1201171875, "learning_rate": 0.1, "loss": 2.5328476428985596, "step": 1994 }, { "epoch": 0.06336507936507936, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.5030629634857178, "step": 1996 }, { "epoch": 0.06342857142857143, "grad_norm": 0.09619140625, "learning_rate": 0.1, "loss": 2.554652452468872, "step": 1998 }, { "epoch": 0.06349206349206349, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.5161564350128174, "step": 2000 }, { "epoch": 0.06355555555555556, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.50634765625, "step": 2002 }, { "epoch": 0.06361904761904762, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.5148537158966064, "step": 2004 }, { "epoch": 0.06368253968253969, "grad_norm": 0.064453125, "learning_rate": 0.1, "loss": 2.5011837482452393, "step": 2006 }, { "epoch": 0.06374603174603174, "grad_norm": 0.58984375, "learning_rate": 0.1, "loss": 2.507392406463623, "step": 2008 }, { "epoch": 0.06380952380952382, "grad_norm": 0.32421875, "learning_rate": 0.1, "loss": 2.5286784172058105, "step": 2010 }, { "epoch": 0.06387301587301587, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.5311315059661865, "step": 2012 }, { "epoch": 0.06393650793650793, "grad_norm": 0.0732421875, "learning_rate": 0.1, "loss": 2.52327299118042, "step": 2014 }, { "epoch": 0.064, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.4933981895446777, "step": 2016 }, { "epoch": 0.06406349206349206, "grad_norm": 0.10595703125, "learning_rate": 0.1, "loss": 2.495870351791382, "step": 2018 }, { "epoch": 0.06412698412698413, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.5211164951324463, "step": 2020 }, { "epoch": 0.06419047619047619, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.53830885887146, "step": 2022 }, { "epoch": 0.06425396825396826, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.5131025314331055, "step": 2024 }, { "epoch": 0.06431746031746031, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.503647804260254, "step": 2026 }, { "epoch": 0.06438095238095239, "grad_norm": 0.2294921875, "learning_rate": 0.1, "loss": 2.492401361465454, "step": 2028 }, { "epoch": 0.06444444444444444, "grad_norm": 0.41796875, "learning_rate": 0.1, "loss": 2.514190912246704, "step": 2030 }, { "epoch": 0.06450793650793651, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.512331247329712, "step": 2032 }, { "epoch": 0.06457142857142857, "grad_norm": 0.11376953125, "learning_rate": 0.1, "loss": 2.4942357540130615, "step": 2034 }, { "epoch": 0.06463492063492063, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.500631093978882, "step": 2036 }, { "epoch": 0.0646984126984127, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.5043468475341797, "step": 2038 }, { "epoch": 0.06476190476190476, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.5141701698303223, "step": 2040 }, { "epoch": 0.06482539682539683, "grad_norm": 0.2490234375, "learning_rate": 0.1, "loss": 2.483848810195923, "step": 2042 }, { "epoch": 0.06488888888888888, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 2.5030460357666016, "step": 2044 }, { "epoch": 0.06495238095238096, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.470506429672241, "step": 2046 }, { "epoch": 0.06501587301587301, "grad_norm": 0.306640625, "learning_rate": 0.1, "loss": 2.51155948638916, "step": 2048 }, { "epoch": 0.06507936507936508, "grad_norm": 0.302734375, "learning_rate": 0.1, "loss": 2.4815001487731934, "step": 2050 }, { "epoch": 0.06514285714285714, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.522413969039917, "step": 2052 }, { "epoch": 0.06520634920634921, "grad_norm": 0.22265625, "learning_rate": 0.1, "loss": 2.4877471923828125, "step": 2054 }, { "epoch": 0.06526984126984127, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.4903132915496826, "step": 2056 }, { "epoch": 0.06533333333333333, "grad_norm": 0.10693359375, "learning_rate": 0.1, "loss": 2.503127336502075, "step": 2058 }, { "epoch": 0.0653968253968254, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.4925239086151123, "step": 2060 }, { "epoch": 0.06546031746031745, "grad_norm": 0.21484375, "learning_rate": 0.1, "loss": 2.486431360244751, "step": 2062 }, { "epoch": 0.06552380952380953, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.517045259475708, "step": 2064 }, { "epoch": 0.06558730158730158, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.4944334030151367, "step": 2066 }, { "epoch": 0.06565079365079365, "grad_norm": 0.376953125, "learning_rate": 0.1, "loss": 2.492345094680786, "step": 2068 }, { "epoch": 0.06571428571428571, "grad_norm": 0.37890625, "learning_rate": 0.1, "loss": 2.4673092365264893, "step": 2070 }, { "epoch": 0.06577777777777778, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.478743553161621, "step": 2072 }, { "epoch": 0.06584126984126984, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.4679133892059326, "step": 2074 }, { "epoch": 0.06590476190476191, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.4856796264648438, "step": 2076 }, { "epoch": 0.06596825396825397, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.48044753074646, "step": 2078 }, { "epoch": 0.06603174603174604, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.4721012115478516, "step": 2080 }, { "epoch": 0.0660952380952381, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.4845523834228516, "step": 2082 }, { "epoch": 0.06615873015873015, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.4772167205810547, "step": 2084 }, { "epoch": 0.06622222222222222, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.5277273654937744, "step": 2086 }, { "epoch": 0.06628571428571428, "grad_norm": 0.232421875, "learning_rate": 0.1, "loss": 2.489206314086914, "step": 2088 }, { "epoch": 0.06634920634920635, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.4865260124206543, "step": 2090 }, { "epoch": 0.06641269841269841, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.472278594970703, "step": 2092 }, { "epoch": 0.06647619047619048, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.4957051277160645, "step": 2094 }, { "epoch": 0.06653968253968254, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.491830348968506, "step": 2096 }, { "epoch": 0.06660317460317461, "grad_norm": 0.2255859375, "learning_rate": 0.1, "loss": 2.471398115158081, "step": 2098 }, { "epoch": 0.06666666666666667, "grad_norm": 0.388671875, "learning_rate": 0.1, "loss": 2.492751359939575, "step": 2100 }, { "epoch": 0.06673015873015874, "grad_norm": 0.0888671875, "learning_rate": 0.1, "loss": 2.484971284866333, "step": 2102 }, { "epoch": 0.0667936507936508, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.4755873680114746, "step": 2104 }, { "epoch": 0.06685714285714285, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.4842822551727295, "step": 2106 }, { "epoch": 0.06692063492063492, "grad_norm": 0.1650390625, "learning_rate": 0.1, "loss": 2.4596633911132812, "step": 2108 }, { "epoch": 0.06698412698412698, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.4494075775146484, "step": 2110 }, { "epoch": 0.06704761904761905, "grad_norm": 0.361328125, "learning_rate": 0.1, "loss": 2.4454522132873535, "step": 2112 }, { "epoch": 0.06711111111111111, "grad_norm": 0.439453125, "learning_rate": 0.1, "loss": 2.485635280609131, "step": 2114 }, { "epoch": 0.06717460317460318, "grad_norm": 0.07177734375, "learning_rate": 0.1, "loss": 2.4559006690979004, "step": 2116 }, { "epoch": 0.06723809523809524, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.4935600757598877, "step": 2118 }, { "epoch": 0.0673015873015873, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.469684600830078, "step": 2120 }, { "epoch": 0.06736507936507936, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.480215072631836, "step": 2122 }, { "epoch": 0.06742857142857143, "grad_norm": 0.373046875, "learning_rate": 0.1, "loss": 2.483480215072632, "step": 2124 }, { "epoch": 0.06749206349206349, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.4839725494384766, "step": 2126 }, { "epoch": 0.06755555555555555, "grad_norm": 0.08984375, "learning_rate": 0.1, "loss": 2.4860727787017822, "step": 2128 }, { "epoch": 0.06761904761904762, "grad_norm": 0.2275390625, "learning_rate": 0.1, "loss": 2.4911935329437256, "step": 2130 }, { "epoch": 0.06768253968253968, "grad_norm": 0.251953125, "learning_rate": 0.1, "loss": 2.4639229774475098, "step": 2132 }, { "epoch": 0.06774603174603175, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.4809765815734863, "step": 2134 }, { "epoch": 0.0678095238095238, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.4845449924468994, "step": 2136 }, { "epoch": 0.06787301587301588, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.457502841949463, "step": 2138 }, { "epoch": 0.06793650793650793, "grad_norm": 0.2353515625, "learning_rate": 0.1, "loss": 2.472931385040283, "step": 2140 }, { "epoch": 0.068, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.477206230163574, "step": 2142 }, { "epoch": 0.06806349206349206, "grad_norm": 0.380859375, "learning_rate": 0.1, "loss": 2.477062225341797, "step": 2144 }, { "epoch": 0.06812698412698413, "grad_norm": 0.515625, "learning_rate": 0.1, "loss": 2.4625442028045654, "step": 2146 }, { "epoch": 0.06819047619047619, "grad_norm": 0.057861328125, "learning_rate": 0.1, "loss": 2.458501100540161, "step": 2148 }, { "epoch": 0.06825396825396825, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.4825305938720703, "step": 2150 }, { "epoch": 0.06831746031746032, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.468508243560791, "step": 2152 }, { "epoch": 0.06838095238095238, "grad_norm": 0.3203125, "learning_rate": 0.1, "loss": 2.500572443008423, "step": 2154 }, { "epoch": 0.06844444444444445, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.4776880741119385, "step": 2156 }, { "epoch": 0.0685079365079365, "grad_norm": 0.2412109375, "learning_rate": 0.1, "loss": 2.4921162128448486, "step": 2158 }, { "epoch": 0.06857142857142857, "grad_norm": 0.251953125, "learning_rate": 0.1, "loss": 2.4685893058776855, "step": 2160 }, { "epoch": 0.06863492063492063, "grad_norm": 0.0810546875, "learning_rate": 0.1, "loss": 2.458979606628418, "step": 2162 }, { "epoch": 0.0686984126984127, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.4854941368103027, "step": 2164 }, { "epoch": 0.06876190476190476, "grad_norm": 0.20703125, "learning_rate": 0.1, "loss": 2.497706413269043, "step": 2166 }, { "epoch": 0.06882539682539683, "grad_norm": 0.2216796875, "learning_rate": 0.1, "loss": 2.474457025527954, "step": 2168 }, { "epoch": 0.06888888888888889, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.4929616451263428, "step": 2170 }, { "epoch": 0.06895238095238095, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.4676895141601562, "step": 2172 }, { "epoch": 0.06901587301587302, "grad_norm": 0.2294921875, "learning_rate": 0.1, "loss": 2.4944210052490234, "step": 2174 }, { "epoch": 0.06907936507936507, "grad_norm": 0.3203125, "learning_rate": 0.1, "loss": 2.4872589111328125, "step": 2176 }, { "epoch": 0.06914285714285714, "grad_norm": 0.29296875, "learning_rate": 0.1, "loss": 2.468313694000244, "step": 2178 }, { "epoch": 0.0692063492063492, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.479515552520752, "step": 2180 }, { "epoch": 0.06926984126984127, "grad_norm": 0.11767578125, "learning_rate": 0.1, "loss": 2.467926025390625, "step": 2182 }, { "epoch": 0.06933333333333333, "grad_norm": 0.11572265625, "learning_rate": 0.1, "loss": 2.4687485694885254, "step": 2184 }, { "epoch": 0.0693968253968254, "grad_norm": 0.11767578125, "learning_rate": 0.1, "loss": 2.47172474861145, "step": 2186 }, { "epoch": 0.06946031746031746, "grad_norm": 0.08056640625, "learning_rate": 0.1, "loss": 2.523641586303711, "step": 2188 }, { "epoch": 0.06952380952380953, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.4597578048706055, "step": 2190 }, { "epoch": 0.06958730158730159, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.482039451599121, "step": 2192 }, { "epoch": 0.06965079365079366, "grad_norm": 0.07373046875, "learning_rate": 0.1, "loss": 2.4621119499206543, "step": 2194 }, { "epoch": 0.06971428571428571, "grad_norm": 0.0947265625, "learning_rate": 0.1, "loss": 2.478515863418579, "step": 2196 }, { "epoch": 0.06977777777777777, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.4821345806121826, "step": 2198 }, { "epoch": 0.06984126984126984, "grad_norm": 0.462890625, "learning_rate": 0.1, "loss": 2.493859052658081, "step": 2200 }, { "epoch": 0.0699047619047619, "grad_norm": 0.35546875, "learning_rate": 0.1, "loss": 2.492931604385376, "step": 2202 }, { "epoch": 0.06996825396825397, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.4553074836730957, "step": 2204 }, { "epoch": 0.07003174603174603, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.491086959838867, "step": 2206 }, { "epoch": 0.0700952380952381, "grad_norm": 0.0703125, "learning_rate": 0.1, "loss": 2.4971084594726562, "step": 2208 }, { "epoch": 0.07015873015873016, "grad_norm": 0.10498046875, "learning_rate": 0.1, "loss": 2.4661858081817627, "step": 2210 }, { "epoch": 0.07022222222222223, "grad_norm": 0.4609375, "learning_rate": 0.1, "loss": 2.4871621131896973, "step": 2212 }, { "epoch": 0.07028571428571428, "grad_norm": 0.859375, "learning_rate": 0.1, "loss": 2.4745965003967285, "step": 2214 }, { "epoch": 0.07034920634920636, "grad_norm": 0.07373046875, "learning_rate": 0.1, "loss": 2.4638142585754395, "step": 2216 }, { "epoch": 0.07041269841269841, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.50646710395813, "step": 2218 }, { "epoch": 0.07047619047619047, "grad_norm": 0.07373046875, "learning_rate": 0.1, "loss": 2.475024461746216, "step": 2220 }, { "epoch": 0.07053968253968254, "grad_norm": 0.08056640625, "learning_rate": 0.1, "loss": 2.4500625133514404, "step": 2222 }, { "epoch": 0.0706031746031746, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.435464382171631, "step": 2224 }, { "epoch": 0.07066666666666667, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.4705374240875244, "step": 2226 }, { "epoch": 0.07073015873015873, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.4763388633728027, "step": 2228 }, { "epoch": 0.0707936507936508, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.4758315086364746, "step": 2230 }, { "epoch": 0.07085714285714285, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.4652233123779297, "step": 2232 }, { "epoch": 0.07092063492063493, "grad_norm": 0.10107421875, "learning_rate": 0.1, "loss": 2.481559991836548, "step": 2234 }, { "epoch": 0.07098412698412698, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.471065044403076, "step": 2236 }, { "epoch": 0.07104761904761905, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.4638373851776123, "step": 2238 }, { "epoch": 0.07111111111111111, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.464317798614502, "step": 2240 }, { "epoch": 0.07117460317460317, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.4907915592193604, "step": 2242 }, { "epoch": 0.07123809523809524, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.461716651916504, "step": 2244 }, { "epoch": 0.0713015873015873, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.482943534851074, "step": 2246 }, { "epoch": 0.07136507936507937, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.4873404502868652, "step": 2248 }, { "epoch": 0.07142857142857142, "grad_norm": 0.2392578125, "learning_rate": 0.1, "loss": 2.4674205780029297, "step": 2250 }, { "epoch": 0.0714920634920635, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.483459234237671, "step": 2252 }, { "epoch": 0.07155555555555555, "grad_norm": 0.1123046875, "learning_rate": 0.1, "loss": 2.4682528972625732, "step": 2254 }, { "epoch": 0.07161904761904762, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.4706039428710938, "step": 2256 }, { "epoch": 0.07168253968253968, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.4639344215393066, "step": 2258 }, { "epoch": 0.07174603174603175, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.4746005535125732, "step": 2260 }, { "epoch": 0.07180952380952381, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.4825053215026855, "step": 2262 }, { "epoch": 0.07187301587301587, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.4548676013946533, "step": 2264 }, { "epoch": 0.07193650793650794, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.4568216800689697, "step": 2266 }, { "epoch": 0.072, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.4841418266296387, "step": 2268 }, { "epoch": 0.07206349206349207, "grad_norm": 0.2216796875, "learning_rate": 0.1, "loss": 2.4694817066192627, "step": 2270 }, { "epoch": 0.07212698412698412, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.4437460899353027, "step": 2272 }, { "epoch": 0.0721904761904762, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.457486629486084, "step": 2274 }, { "epoch": 0.07225396825396825, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.440682888031006, "step": 2276 }, { "epoch": 0.07231746031746032, "grad_norm": 0.44921875, "learning_rate": 0.1, "loss": 2.487182855606079, "step": 2278 }, { "epoch": 0.07238095238095238, "grad_norm": 0.439453125, "learning_rate": 0.1, "loss": 2.489858388900757, "step": 2280 }, { "epoch": 0.07244444444444445, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.458692789077759, "step": 2282 }, { "epoch": 0.07250793650793651, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.456444501876831, "step": 2284 }, { "epoch": 0.07257142857142856, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.4622771739959717, "step": 2286 }, { "epoch": 0.07263492063492064, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.459183931350708, "step": 2288 }, { "epoch": 0.07269841269841269, "grad_norm": 0.1181640625, "learning_rate": 0.1, "loss": 2.452925682067871, "step": 2290 }, { "epoch": 0.07276190476190476, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.470534324645996, "step": 2292 }, { "epoch": 0.07282539682539682, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.4595272541046143, "step": 2294 }, { "epoch": 0.07288888888888889, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.4717190265655518, "step": 2296 }, { "epoch": 0.07295238095238095, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.4618430137634277, "step": 2298 }, { "epoch": 0.07301587301587302, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.48429536819458, "step": 2300 }, { "epoch": 0.07307936507936508, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.4697506427764893, "step": 2302 }, { "epoch": 0.07314285714285715, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.4451119899749756, "step": 2304 }, { "epoch": 0.0732063492063492, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.443060874938965, "step": 2306 }, { "epoch": 0.07326984126984128, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.4597675800323486, "step": 2308 }, { "epoch": 0.07333333333333333, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.467695474624634, "step": 2310 }, { "epoch": 0.07339682539682539, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.4525787830352783, "step": 2312 }, { "epoch": 0.07346031746031746, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.457634687423706, "step": 2314 }, { "epoch": 0.07352380952380952, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.4801790714263916, "step": 2316 }, { "epoch": 0.07358730158730159, "grad_norm": 0.388671875, "learning_rate": 0.1, "loss": 2.4629147052764893, "step": 2318 }, { "epoch": 0.07365079365079365, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.4725701808929443, "step": 2320 }, { "epoch": 0.07371428571428572, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.4694488048553467, "step": 2322 }, { "epoch": 0.07377777777777778, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.433459997177124, "step": 2324 }, { "epoch": 0.07384126984126985, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.4501688480377197, "step": 2326 }, { "epoch": 0.0739047619047619, "grad_norm": 0.30859375, "learning_rate": 0.1, "loss": 2.4614992141723633, "step": 2328 }, { "epoch": 0.07396825396825397, "grad_norm": 0.2451171875, "learning_rate": 0.1, "loss": 2.4662961959838867, "step": 2330 }, { "epoch": 0.07403174603174603, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.412978172302246, "step": 2332 }, { "epoch": 0.07409523809523809, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.4386823177337646, "step": 2334 }, { "epoch": 0.07415873015873016, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.475775957107544, "step": 2336 }, { "epoch": 0.07422222222222222, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.4479358196258545, "step": 2338 }, { "epoch": 0.07428571428571429, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.450390577316284, "step": 2340 }, { "epoch": 0.07434920634920635, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.4731197357177734, "step": 2342 }, { "epoch": 0.07441269841269842, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.448122262954712, "step": 2344 }, { "epoch": 0.07447619047619047, "grad_norm": 0.2060546875, "learning_rate": 0.1, "loss": 2.448392391204834, "step": 2346 }, { "epoch": 0.07453968253968254, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.488832712173462, "step": 2348 }, { "epoch": 0.0746031746031746, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.431319236755371, "step": 2350 }, { "epoch": 0.07466666666666667, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.451777219772339, "step": 2352 }, { "epoch": 0.07473015873015873, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.4640417098999023, "step": 2354 }, { "epoch": 0.07479365079365079, "grad_norm": 0.423828125, "learning_rate": 0.1, "loss": 2.4469501972198486, "step": 2356 }, { "epoch": 0.07485714285714286, "grad_norm": 0.240234375, "learning_rate": 0.1, "loss": 2.4534530639648438, "step": 2358 }, { "epoch": 0.07492063492063492, "grad_norm": 0.11767578125, "learning_rate": 0.1, "loss": 2.461578607559204, "step": 2360 }, { "epoch": 0.07498412698412699, "grad_norm": 0.248046875, "learning_rate": 0.1, "loss": 2.457587957382202, "step": 2362 }, { "epoch": 0.07504761904761904, "grad_norm": 0.380859375, "learning_rate": 0.1, "loss": 2.4580841064453125, "step": 2364 }, { "epoch": 0.07511111111111111, "grad_norm": 0.1123046875, "learning_rate": 0.1, "loss": 2.453047513961792, "step": 2366 }, { "epoch": 0.07517460317460317, "grad_norm": 0.10986328125, "learning_rate": 0.1, "loss": 2.444765329360962, "step": 2368 }, { "epoch": 0.07523809523809524, "grad_norm": 0.25, "learning_rate": 0.1, "loss": 2.426901340484619, "step": 2370 }, { "epoch": 0.0753015873015873, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.424010753631592, "step": 2372 }, { "epoch": 0.07536507936507937, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.43820858001709, "step": 2374 }, { "epoch": 0.07542857142857143, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.4476804733276367, "step": 2376 }, { "epoch": 0.07549206349206349, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.435978412628174, "step": 2378 }, { "epoch": 0.07555555555555556, "grad_norm": 0.2177734375, "learning_rate": 0.1, "loss": 2.419553518295288, "step": 2380 }, { "epoch": 0.07561904761904761, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.430812358856201, "step": 2382 }, { "epoch": 0.07568253968253968, "grad_norm": 0.10498046875, "learning_rate": 0.1, "loss": 2.4563260078430176, "step": 2384 }, { "epoch": 0.07574603174603174, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.4316368103027344, "step": 2386 }, { "epoch": 0.07580952380952381, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.434985399246216, "step": 2388 }, { "epoch": 0.07587301587301587, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.440274477005005, "step": 2390 }, { "epoch": 0.07593650793650794, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.4323530197143555, "step": 2392 }, { "epoch": 0.076, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.4373326301574707, "step": 2394 }, { "epoch": 0.07606349206349207, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.4207451343536377, "step": 2396 }, { "epoch": 0.07612698412698413, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.420163869857788, "step": 2398 }, { "epoch": 0.0761904761904762, "grad_norm": 0.10205078125, "learning_rate": 0.1, "loss": 2.439526319503784, "step": 2400 }, { "epoch": 0.07625396825396825, "grad_norm": 0.07568359375, "learning_rate": 0.1, "loss": 2.4116086959838867, "step": 2402 }, { "epoch": 0.07631746031746031, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.4341793060302734, "step": 2404 }, { "epoch": 0.07638095238095238, "grad_norm": 0.345703125, "learning_rate": 0.1, "loss": 2.4284491539001465, "step": 2406 }, { "epoch": 0.07644444444444444, "grad_norm": 0.62109375, "learning_rate": 0.1, "loss": 2.4533045291900635, "step": 2408 }, { "epoch": 0.07650793650793651, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.47479248046875, "step": 2410 }, { "epoch": 0.07657142857142857, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.436018228530884, "step": 2412 }, { "epoch": 0.07663492063492064, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.423745632171631, "step": 2414 }, { "epoch": 0.0766984126984127, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.4376609325408936, "step": 2416 }, { "epoch": 0.07676190476190477, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.4310152530670166, "step": 2418 }, { "epoch": 0.07682539682539682, "grad_norm": 0.09814453125, "learning_rate": 0.1, "loss": 2.3952605724334717, "step": 2420 }, { "epoch": 0.0768888888888889, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.428652048110962, "step": 2422 }, { "epoch": 0.07695238095238095, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.440258741378784, "step": 2424 }, { "epoch": 0.07701587301587301, "grad_norm": 0.2197265625, "learning_rate": 0.1, "loss": 2.444493532180786, "step": 2426 }, { "epoch": 0.07707936507936508, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.431042194366455, "step": 2428 }, { "epoch": 0.07714285714285714, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.4103918075561523, "step": 2430 }, { "epoch": 0.07720634920634921, "grad_norm": 0.21484375, "learning_rate": 0.1, "loss": 2.401892900466919, "step": 2432 }, { "epoch": 0.07726984126984127, "grad_norm": 0.216796875, "learning_rate": 0.1, "loss": 2.4302146434783936, "step": 2434 }, { "epoch": 0.07733333333333334, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.437680244445801, "step": 2436 }, { "epoch": 0.0773968253968254, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.412109851837158, "step": 2438 }, { "epoch": 0.07746031746031747, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.4031805992126465, "step": 2440 }, { "epoch": 0.07752380952380952, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.4190900325775146, "step": 2442 }, { "epoch": 0.0775873015873016, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.4275946617126465, "step": 2444 }, { "epoch": 0.07765079365079365, "grad_norm": 0.0986328125, "learning_rate": 0.1, "loss": 2.4430408477783203, "step": 2446 }, { "epoch": 0.07771428571428571, "grad_norm": 0.10009765625, "learning_rate": 0.1, "loss": 2.4210867881774902, "step": 2448 }, { "epoch": 0.07777777777777778, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.430661916732788, "step": 2450 }, { "epoch": 0.07784126984126984, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.420783042907715, "step": 2452 }, { "epoch": 0.07790476190476191, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.4149372577667236, "step": 2454 }, { "epoch": 0.07796825396825396, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.4412002563476562, "step": 2456 }, { "epoch": 0.07803174603174604, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.435023546218872, "step": 2458 }, { "epoch": 0.07809523809523809, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.4161536693573, "step": 2460 }, { "epoch": 0.07815873015873016, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.4357218742370605, "step": 2462 }, { "epoch": 0.07822222222222222, "grad_norm": 0.333984375, "learning_rate": 0.1, "loss": 2.4202427864074707, "step": 2464 }, { "epoch": 0.07828571428571429, "grad_norm": 0.8828125, "learning_rate": 0.1, "loss": 2.4187259674072266, "step": 2466 }, { "epoch": 0.07834920634920635, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.4365692138671875, "step": 2468 }, { "epoch": 0.0784126984126984, "grad_norm": 0.08203125, "learning_rate": 0.1, "loss": 2.4217774868011475, "step": 2470 }, { "epoch": 0.07847619047619048, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.4398744106292725, "step": 2472 }, { "epoch": 0.07853968253968253, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.429107904434204, "step": 2474 }, { "epoch": 0.0786031746031746, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.4286813735961914, "step": 2476 }, { "epoch": 0.07866666666666666, "grad_norm": 0.11669921875, "learning_rate": 0.1, "loss": 2.419365167617798, "step": 2478 }, { "epoch": 0.07873015873015873, "grad_norm": 0.1162109375, "learning_rate": 0.1, "loss": 2.421030044555664, "step": 2480 }, { "epoch": 0.07879365079365079, "grad_norm": 0.083984375, "learning_rate": 0.1, "loss": 2.454935073852539, "step": 2482 }, { "epoch": 0.07885714285714286, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.4285285472869873, "step": 2484 }, { "epoch": 0.07892063492063492, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.4191834926605225, "step": 2486 }, { "epoch": 0.07898412698412699, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.4343907833099365, "step": 2488 }, { "epoch": 0.07904761904761905, "grad_norm": 0.1171875, "learning_rate": 0.1, "loss": 2.433274030685425, "step": 2490 }, { "epoch": 0.0791111111111111, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.421208620071411, "step": 2492 }, { "epoch": 0.07917460317460318, "grad_norm": 0.228515625, "learning_rate": 0.1, "loss": 2.4023795127868652, "step": 2494 }, { "epoch": 0.07923809523809523, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.4026215076446533, "step": 2496 }, { "epoch": 0.0793015873015873, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.43749737739563, "step": 2498 }, { "epoch": 0.07936507936507936, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.4021148681640625, "step": 2500 }, { "epoch": 0.07942857142857143, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.4000911712646484, "step": 2502 }, { "epoch": 0.07949206349206349, "grad_norm": 0.07763671875, "learning_rate": 0.1, "loss": 2.4468750953674316, "step": 2504 }, { "epoch": 0.07955555555555556, "grad_norm": 0.2177734375, "learning_rate": 0.1, "loss": 2.4073941707611084, "step": 2506 }, { "epoch": 0.07961904761904762, "grad_norm": 0.44921875, "learning_rate": 0.1, "loss": 2.3922007083892822, "step": 2508 }, { "epoch": 0.07968253968253969, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.424196481704712, "step": 2510 }, { "epoch": 0.07974603174603175, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.4011919498443604, "step": 2512 }, { "epoch": 0.07980952380952382, "grad_norm": 0.1650390625, "learning_rate": 0.1, "loss": 2.427478790283203, "step": 2514 }, { "epoch": 0.07987301587301587, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.3962597846984863, "step": 2516 }, { "epoch": 0.07993650793650793, "grad_norm": 0.2197265625, "learning_rate": 0.1, "loss": 2.39072585105896, "step": 2518 }, { "epoch": 0.08, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.4374947547912598, "step": 2520 }, { "epoch": 0.08006349206349206, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.429957151412964, "step": 2522 }, { "epoch": 0.08012698412698413, "grad_norm": 0.375, "learning_rate": 0.1, "loss": 2.413072109222412, "step": 2524 }, { "epoch": 0.08019047619047619, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.4154856204986572, "step": 2526 }, { "epoch": 0.08025396825396826, "grad_norm": 0.111328125, "learning_rate": 0.1, "loss": 2.445281744003296, "step": 2528 }, { "epoch": 0.08031746031746032, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.4195079803466797, "step": 2530 }, { "epoch": 0.08038095238095239, "grad_norm": 0.1982421875, "learning_rate": 0.1, "loss": 2.4104957580566406, "step": 2532 }, { "epoch": 0.08044444444444444, "grad_norm": 0.30078125, "learning_rate": 0.1, "loss": 2.408351421356201, "step": 2534 }, { "epoch": 0.08050793650793651, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.4167144298553467, "step": 2536 }, { "epoch": 0.08057142857142857, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.4040238857269287, "step": 2538 }, { "epoch": 0.08063492063492063, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.4108786582946777, "step": 2540 }, { "epoch": 0.0806984126984127, "grad_norm": 0.216796875, "learning_rate": 0.1, "loss": 2.4052939414978027, "step": 2542 }, { "epoch": 0.08076190476190476, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.4246435165405273, "step": 2544 }, { "epoch": 0.08082539682539683, "grad_norm": 0.400390625, "learning_rate": 0.1, "loss": 2.412308692932129, "step": 2546 }, { "epoch": 0.08088888888888889, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.3998472690582275, "step": 2548 }, { "epoch": 0.08095238095238096, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.4107229709625244, "step": 2550 }, { "epoch": 0.08101587301587301, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.4273386001586914, "step": 2552 }, { "epoch": 0.08107936507936508, "grad_norm": 0.1025390625, "learning_rate": 0.1, "loss": 2.399528980255127, "step": 2554 }, { "epoch": 0.08114285714285714, "grad_norm": 0.0888671875, "learning_rate": 0.1, "loss": 2.446045398712158, "step": 2556 }, { "epoch": 0.08120634920634921, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.3881447315216064, "step": 2558 }, { "epoch": 0.08126984126984127, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.419867515563965, "step": 2560 }, { "epoch": 0.08133333333333333, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.384958505630493, "step": 2562 }, { "epoch": 0.0813968253968254, "grad_norm": 0.083984375, "learning_rate": 0.1, "loss": 2.397991895675659, "step": 2564 }, { "epoch": 0.08146031746031746, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.422476053237915, "step": 2566 }, { "epoch": 0.08152380952380953, "grad_norm": 0.2060546875, "learning_rate": 0.1, "loss": 2.397609233856201, "step": 2568 }, { "epoch": 0.08158730158730158, "grad_norm": 0.470703125, "learning_rate": 0.1, "loss": 2.416428804397583, "step": 2570 }, { "epoch": 0.08165079365079365, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.438084363937378, "step": 2572 }, { "epoch": 0.08171428571428571, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.4178433418273926, "step": 2574 }, { "epoch": 0.08177777777777778, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.3941657543182373, "step": 2576 }, { "epoch": 0.08184126984126984, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.4136197566986084, "step": 2578 }, { "epoch": 0.08190476190476191, "grad_norm": 0.3046875, "learning_rate": 0.1, "loss": 2.4213650226593018, "step": 2580 }, { "epoch": 0.08196825396825397, "grad_norm": 0.115234375, "learning_rate": 0.1, "loss": 2.408294677734375, "step": 2582 }, { "epoch": 0.08203174603174603, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.4357292652130127, "step": 2584 }, { "epoch": 0.0820952380952381, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.4219906330108643, "step": 2586 }, { "epoch": 0.08215873015873015, "grad_norm": 0.10546875, "learning_rate": 0.1, "loss": 2.426844596862793, "step": 2588 }, { "epoch": 0.08222222222222222, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.416428804397583, "step": 2590 }, { "epoch": 0.08228571428571428, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.4195680618286133, "step": 2592 }, { "epoch": 0.08234920634920635, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.407853126525879, "step": 2594 }, { "epoch": 0.08241269841269841, "grad_norm": 0.2470703125, "learning_rate": 0.1, "loss": 2.3943076133728027, "step": 2596 }, { "epoch": 0.08247619047619048, "grad_norm": 0.30859375, "learning_rate": 0.1, "loss": 2.412315845489502, "step": 2598 }, { "epoch": 0.08253968253968254, "grad_norm": 0.0810546875, "learning_rate": 0.1, "loss": 2.415006399154663, "step": 2600 }, { "epoch": 0.08260317460317461, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.412538528442383, "step": 2602 }, { "epoch": 0.08266666666666667, "grad_norm": 0.10107421875, "learning_rate": 0.1, "loss": 2.428304433822632, "step": 2604 }, { "epoch": 0.08273015873015872, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.4306912422180176, "step": 2606 }, { "epoch": 0.0827936507936508, "grad_norm": 0.474609375, "learning_rate": 0.1, "loss": 2.395228147506714, "step": 2608 }, { "epoch": 0.08285714285714285, "grad_norm": 0.2255859375, "learning_rate": 0.1, "loss": 2.3916776180267334, "step": 2610 }, { "epoch": 0.08292063492063492, "grad_norm": 0.07275390625, "learning_rate": 0.1, "loss": 2.409454822540283, "step": 2612 }, { "epoch": 0.08298412698412698, "grad_norm": 0.08203125, "learning_rate": 0.1, "loss": 2.3860230445861816, "step": 2614 }, { "epoch": 0.08304761904761905, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.424116849899292, "step": 2616 }, { "epoch": 0.08311111111111111, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.4006242752075195, "step": 2618 }, { "epoch": 0.08317460317460318, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.397615432739258, "step": 2620 }, { "epoch": 0.08323809523809524, "grad_norm": 0.1240234375, "learning_rate": 0.1, "loss": 2.423779249191284, "step": 2622 }, { "epoch": 0.08330158730158731, "grad_norm": 0.2412109375, "learning_rate": 0.1, "loss": 2.3777194023132324, "step": 2624 }, { "epoch": 0.08336507936507936, "grad_norm": 0.40234375, "learning_rate": 0.1, "loss": 2.4126791954040527, "step": 2626 }, { "epoch": 0.08342857142857144, "grad_norm": 0.29296875, "learning_rate": 0.1, "loss": 2.415865898132324, "step": 2628 }, { "epoch": 0.08349206349206349, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.455211877822876, "step": 2630 }, { "epoch": 0.08355555555555555, "grad_norm": 0.09814453125, "learning_rate": 0.1, "loss": 2.427333116531372, "step": 2632 }, { "epoch": 0.08361904761904762, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.4025228023529053, "step": 2634 }, { "epoch": 0.08368253968253968, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.3941171169281006, "step": 2636 }, { "epoch": 0.08374603174603175, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.403909206390381, "step": 2638 }, { "epoch": 0.0838095238095238, "grad_norm": 0.431640625, "learning_rate": 0.1, "loss": 2.4368860721588135, "step": 2640 }, { "epoch": 0.08387301587301588, "grad_norm": 0.220703125, "learning_rate": 0.1, "loss": 2.412644147872925, "step": 2642 }, { "epoch": 0.08393650793650793, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.4059386253356934, "step": 2644 }, { "epoch": 0.084, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.4038009643554688, "step": 2646 }, { "epoch": 0.08406349206349206, "grad_norm": 0.07080078125, "learning_rate": 0.1, "loss": 2.411282539367676, "step": 2648 }, { "epoch": 0.08412698412698413, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.4046213626861572, "step": 2650 }, { "epoch": 0.08419047619047619, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.402143716812134, "step": 2652 }, { "epoch": 0.08425396825396825, "grad_norm": 0.33203125, "learning_rate": 0.1, "loss": 2.4237313270568848, "step": 2654 }, { "epoch": 0.08431746031746032, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.38344669342041, "step": 2656 }, { "epoch": 0.08438095238095238, "grad_norm": 0.09814453125, "learning_rate": 0.1, "loss": 2.3954100608825684, "step": 2658 }, { "epoch": 0.08444444444444445, "grad_norm": 0.09375, "learning_rate": 0.1, "loss": 2.4000606536865234, "step": 2660 }, { "epoch": 0.0845079365079365, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.393099069595337, "step": 2662 }, { "epoch": 0.08457142857142858, "grad_norm": 0.306640625, "learning_rate": 0.1, "loss": 2.3886022567749023, "step": 2664 }, { "epoch": 0.08463492063492063, "grad_norm": 0.349609375, "learning_rate": 0.1, "loss": 2.397421360015869, "step": 2666 }, { "epoch": 0.0846984126984127, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.4051928520202637, "step": 2668 }, { "epoch": 0.08476190476190476, "grad_norm": 0.1328125, "learning_rate": 0.1, "loss": 2.4021687507629395, "step": 2670 }, { "epoch": 0.08482539682539683, "grad_norm": 0.23828125, "learning_rate": 0.1, "loss": 2.399261951446533, "step": 2672 }, { "epoch": 0.08488888888888889, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.399930715560913, "step": 2674 }, { "epoch": 0.08495238095238095, "grad_norm": 0.0703125, "learning_rate": 0.1, "loss": 2.3806633949279785, "step": 2676 }, { "epoch": 0.08501587301587302, "grad_norm": 0.341796875, "learning_rate": 0.1, "loss": 2.411097526550293, "step": 2678 }, { "epoch": 0.08507936507936507, "grad_norm": 0.4921875, "learning_rate": 0.1, "loss": 2.3698480129241943, "step": 2680 }, { "epoch": 0.08514285714285715, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.4031405448913574, "step": 2682 }, { "epoch": 0.0852063492063492, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.3754119873046875, "step": 2684 }, { "epoch": 0.08526984126984127, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.4002645015716553, "step": 2686 }, { "epoch": 0.08533333333333333, "grad_norm": 0.0947265625, "learning_rate": 0.1, "loss": 2.412679672241211, "step": 2688 }, { "epoch": 0.0853968253968254, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.3935630321502686, "step": 2690 }, { "epoch": 0.08546031746031746, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.3918657302856445, "step": 2692 }, { "epoch": 0.08552380952380953, "grad_norm": 0.11962890625, "learning_rate": 0.1, "loss": 2.3883056640625, "step": 2694 }, { "epoch": 0.08558730158730159, "grad_norm": 0.10693359375, "learning_rate": 0.1, "loss": 2.367241382598877, "step": 2696 }, { "epoch": 0.08565079365079364, "grad_norm": 0.20703125, "learning_rate": 0.1, "loss": 2.4024059772491455, "step": 2698 }, { "epoch": 0.08571428571428572, "grad_norm": 0.259765625, "learning_rate": 0.1, "loss": 2.402935266494751, "step": 2700 }, { "epoch": 0.08577777777777777, "grad_norm": 0.203125, "learning_rate": 0.1, "loss": 2.3991940021514893, "step": 2702 }, { "epoch": 0.08584126984126984, "grad_norm": 0.10498046875, "learning_rate": 0.1, "loss": 2.3972902297973633, "step": 2704 }, { "epoch": 0.0859047619047619, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.3746674060821533, "step": 2706 }, { "epoch": 0.08596825396825397, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.369091272354126, "step": 2708 }, { "epoch": 0.08603174603174603, "grad_norm": 0.10888671875, "learning_rate": 0.1, "loss": 2.3942418098449707, "step": 2710 }, { "epoch": 0.0860952380952381, "grad_norm": 0.236328125, "learning_rate": 0.1, "loss": 2.3859245777130127, "step": 2712 }, { "epoch": 0.08615873015873016, "grad_norm": 0.29296875, "learning_rate": 0.1, "loss": 2.391073226928711, "step": 2714 }, { "epoch": 0.08622222222222223, "grad_norm": 0.2333984375, "learning_rate": 0.1, "loss": 2.3894689083099365, "step": 2716 }, { "epoch": 0.08628571428571429, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.394273042678833, "step": 2718 }, { "epoch": 0.08634920634920636, "grad_norm": 0.2734375, "learning_rate": 0.1, "loss": 2.376675605773926, "step": 2720 }, { "epoch": 0.08641269841269841, "grad_norm": 0.06396484375, "learning_rate": 0.1, "loss": 2.379934549331665, "step": 2722 }, { "epoch": 0.08647619047619047, "grad_norm": 0.11669921875, "learning_rate": 0.1, "loss": 2.370460271835327, "step": 2724 }, { "epoch": 0.08653968253968254, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.400240182876587, "step": 2726 }, { "epoch": 0.0866031746031746, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.4172515869140625, "step": 2728 }, { "epoch": 0.08666666666666667, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.380107879638672, "step": 2730 }, { "epoch": 0.08673015873015873, "grad_norm": 0.2373046875, "learning_rate": 0.1, "loss": 2.364217758178711, "step": 2732 }, { "epoch": 0.0867936507936508, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.3903818130493164, "step": 2734 }, { "epoch": 0.08685714285714285, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.3767409324645996, "step": 2736 }, { "epoch": 0.08692063492063493, "grad_norm": 0.3671875, "learning_rate": 0.1, "loss": 2.398111343383789, "step": 2738 }, { "epoch": 0.08698412698412698, "grad_norm": 0.365234375, "learning_rate": 0.1, "loss": 2.3614792823791504, "step": 2740 }, { "epoch": 0.08704761904761905, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.374438524246216, "step": 2742 }, { "epoch": 0.08711111111111111, "grad_norm": 0.1240234375, "learning_rate": 0.1, "loss": 2.361042022705078, "step": 2744 }, { "epoch": 0.08717460317460317, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.381786584854126, "step": 2746 }, { "epoch": 0.08723809523809524, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.397627353668213, "step": 2748 }, { "epoch": 0.0873015873015873, "grad_norm": 0.220703125, "learning_rate": 0.1, "loss": 2.4032721519470215, "step": 2750 }, { "epoch": 0.08736507936507937, "grad_norm": 0.111328125, "learning_rate": 0.1, "loss": 2.3714730739593506, "step": 2752 }, { "epoch": 0.08742857142857142, "grad_norm": 0.08544921875, "learning_rate": 0.1, "loss": 2.390585422515869, "step": 2754 }, { "epoch": 0.0874920634920635, "grad_norm": 0.0751953125, "learning_rate": 0.1, "loss": 2.394010066986084, "step": 2756 }, { "epoch": 0.08755555555555555, "grad_norm": 0.1142578125, "learning_rate": 0.1, "loss": 2.4022858142852783, "step": 2758 }, { "epoch": 0.08761904761904762, "grad_norm": 0.26171875, "learning_rate": 0.1, "loss": 2.381974220275879, "step": 2760 }, { "epoch": 0.08768253968253968, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.4119536876678467, "step": 2762 }, { "epoch": 0.08774603174603175, "grad_norm": 0.1103515625, "learning_rate": 0.1, "loss": 2.384192943572998, "step": 2764 }, { "epoch": 0.08780952380952381, "grad_norm": 0.2294921875, "learning_rate": 0.1, "loss": 2.4055330753326416, "step": 2766 }, { "epoch": 0.08787301587301587, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.3810219764709473, "step": 2768 }, { "epoch": 0.08793650793650794, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.3948256969451904, "step": 2770 }, { "epoch": 0.088, "grad_norm": 0.40234375, "learning_rate": 0.1, "loss": 2.387455940246582, "step": 2772 }, { "epoch": 0.08806349206349207, "grad_norm": 0.328125, "learning_rate": 0.1, "loss": 2.3881478309631348, "step": 2774 }, { "epoch": 0.08812698412698412, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.398066997528076, "step": 2776 }, { "epoch": 0.0881904761904762, "grad_norm": 0.322265625, "learning_rate": 0.1, "loss": 2.3875157833099365, "step": 2778 }, { "epoch": 0.08825396825396825, "grad_norm": 0.3046875, "learning_rate": 0.1, "loss": 2.3789241313934326, "step": 2780 }, { "epoch": 0.08831746031746032, "grad_norm": 0.4375, "learning_rate": 0.1, "loss": 2.3841495513916016, "step": 2782 }, { "epoch": 0.08838095238095238, "grad_norm": 0.31640625, "learning_rate": 0.1, "loss": 2.364940881729126, "step": 2784 }, { "epoch": 0.08844444444444445, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.4076099395751953, "step": 2786 }, { "epoch": 0.08850793650793651, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.3738746643066406, "step": 2788 }, { "epoch": 0.08857142857142856, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.4016308784484863, "step": 2790 }, { "epoch": 0.08863492063492064, "grad_norm": 0.10302734375, "learning_rate": 0.1, "loss": 2.3824143409729004, "step": 2792 }, { "epoch": 0.08869841269841269, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.3976755142211914, "step": 2794 }, { "epoch": 0.08876190476190476, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.3839035034179688, "step": 2796 }, { "epoch": 0.08882539682539682, "grad_norm": 0.07275390625, "learning_rate": 0.1, "loss": 2.3902008533477783, "step": 2798 }, { "epoch": 0.08888888888888889, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.404540538787842, "step": 2800 }, { "epoch": 0.08895238095238095, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.397319793701172, "step": 2802 }, { "epoch": 0.08901587301587302, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.3604745864868164, "step": 2804 }, { "epoch": 0.08907936507936508, "grad_norm": 0.1015625, "learning_rate": 0.1, "loss": 2.3771674633026123, "step": 2806 }, { "epoch": 0.08914285714285715, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.371016502380371, "step": 2808 }, { "epoch": 0.0892063492063492, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.400768518447876, "step": 2810 }, { "epoch": 0.08926984126984126, "grad_norm": 0.0654296875, "learning_rate": 0.1, "loss": 2.3920140266418457, "step": 2812 }, { "epoch": 0.08933333333333333, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.3702197074890137, "step": 2814 }, { "epoch": 0.08939682539682539, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.3955814838409424, "step": 2816 }, { "epoch": 0.08946031746031746, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.388280153274536, "step": 2818 }, { "epoch": 0.08952380952380952, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.356153726577759, "step": 2820 }, { "epoch": 0.08958730158730159, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.3995931148529053, "step": 2822 }, { "epoch": 0.08965079365079365, "grad_norm": 0.54296875, "learning_rate": 0.1, "loss": 2.3900046348571777, "step": 2824 }, { "epoch": 0.08971428571428572, "grad_norm": 0.490234375, "learning_rate": 0.1, "loss": 2.388975143432617, "step": 2826 }, { "epoch": 0.08977777777777778, "grad_norm": 0.059326171875, "learning_rate": 0.1, "loss": 2.3842997550964355, "step": 2828 }, { "epoch": 0.08984126984126985, "grad_norm": 0.10595703125, "learning_rate": 0.1, "loss": 2.3644092082977295, "step": 2830 }, { "epoch": 0.0899047619047619, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.3734235763549805, "step": 2832 }, { "epoch": 0.08996825396825398, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.369443416595459, "step": 2834 }, { "epoch": 0.09003174603174603, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.406440258026123, "step": 2836 }, { "epoch": 0.09009523809523809, "grad_norm": 0.1953125, "learning_rate": 0.1, "loss": 2.3799054622650146, "step": 2838 }, { "epoch": 0.09015873015873016, "grad_norm": 0.1123046875, "learning_rate": 0.1, "loss": 2.3945982456207275, "step": 2840 }, { "epoch": 0.09022222222222222, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.3752596378326416, "step": 2842 }, { "epoch": 0.09028571428571429, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.3628222942352295, "step": 2844 }, { "epoch": 0.09034920634920635, "grad_norm": 0.294921875, "learning_rate": 0.1, "loss": 2.383291244506836, "step": 2846 }, { "epoch": 0.09041269841269842, "grad_norm": 0.07373046875, "learning_rate": 0.1, "loss": 2.3690989017486572, "step": 2848 }, { "epoch": 0.09047619047619047, "grad_norm": 0.11083984375, "learning_rate": 0.1, "loss": 2.3845341205596924, "step": 2850 }, { "epoch": 0.09053968253968254, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.3914670944213867, "step": 2852 }, { "epoch": 0.0906031746031746, "grad_norm": 0.0625, "learning_rate": 0.1, "loss": 2.3840696811676025, "step": 2854 }, { "epoch": 0.09066666666666667, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.399721622467041, "step": 2856 }, { "epoch": 0.09073015873015873, "grad_norm": 0.1103515625, "learning_rate": 0.1, "loss": 2.3811497688293457, "step": 2858 }, { "epoch": 0.09079365079365079, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.3901965618133545, "step": 2860 }, { "epoch": 0.09085714285714286, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.3706390857696533, "step": 2862 }, { "epoch": 0.09092063492063492, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.3982973098754883, "step": 2864 }, { "epoch": 0.09098412698412699, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.3944590091705322, "step": 2866 }, { "epoch": 0.09104761904761904, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.382413625717163, "step": 2868 }, { "epoch": 0.09111111111111111, "grad_norm": 0.328125, "learning_rate": 0.1, "loss": 2.3877480030059814, "step": 2870 }, { "epoch": 0.09117460317460317, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.378044605255127, "step": 2872 }, { "epoch": 0.09123809523809524, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.3949317932128906, "step": 2874 }, { "epoch": 0.0913015873015873, "grad_norm": 0.392578125, "learning_rate": 0.1, "loss": 2.3714022636413574, "step": 2876 }, { "epoch": 0.09136507936507937, "grad_norm": 0.048828125, "learning_rate": 0.1, "loss": 2.3776369094848633, "step": 2878 }, { "epoch": 0.09142857142857143, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.3884694576263428, "step": 2880 }, { "epoch": 0.09149206349206349, "grad_norm": 0.2275390625, "learning_rate": 0.1, "loss": 2.3951470851898193, "step": 2882 }, { "epoch": 0.09155555555555556, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.3753321170806885, "step": 2884 }, { "epoch": 0.09161904761904761, "grad_norm": 0.1416015625, "learning_rate": 0.1, "loss": 2.370924472808838, "step": 2886 }, { "epoch": 0.09168253968253968, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.3893954753875732, "step": 2888 }, { "epoch": 0.09174603174603174, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.387089252471924, "step": 2890 }, { "epoch": 0.09180952380952381, "grad_norm": 0.236328125, "learning_rate": 0.1, "loss": 2.3690972328186035, "step": 2892 }, { "epoch": 0.09187301587301587, "grad_norm": 0.21484375, "learning_rate": 0.1, "loss": 2.3846676349639893, "step": 2894 }, { "epoch": 0.09193650793650794, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.401796579360962, "step": 2896 }, { "epoch": 0.092, "grad_norm": 0.3515625, "learning_rate": 0.1, "loss": 2.386007308959961, "step": 2898 }, { "epoch": 0.09206349206349207, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.3689541816711426, "step": 2900 }, { "epoch": 0.09212698412698413, "grad_norm": 0.06591796875, "learning_rate": 0.1, "loss": 2.3645076751708984, "step": 2902 }, { "epoch": 0.09219047619047618, "grad_norm": 0.0634765625, "learning_rate": 0.1, "loss": 2.3670029640197754, "step": 2904 }, { "epoch": 0.09225396825396825, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.404618501663208, "step": 2906 }, { "epoch": 0.09231746031746031, "grad_norm": 0.36328125, "learning_rate": 0.1, "loss": 2.3593907356262207, "step": 2908 }, { "epoch": 0.09238095238095238, "grad_norm": 0.36328125, "learning_rate": 0.1, "loss": 2.3985025882720947, "step": 2910 }, { "epoch": 0.09244444444444444, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.377531051635742, "step": 2912 }, { "epoch": 0.09250793650793651, "grad_norm": 0.0732421875, "learning_rate": 0.1, "loss": 2.389643669128418, "step": 2914 }, { "epoch": 0.09257142857142857, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.391726016998291, "step": 2916 }, { "epoch": 0.09263492063492064, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.389172315597534, "step": 2918 }, { "epoch": 0.0926984126984127, "grad_norm": 0.0927734375, "learning_rate": 0.1, "loss": 2.411067485809326, "step": 2920 }, { "epoch": 0.09276190476190477, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.3761720657348633, "step": 2922 }, { "epoch": 0.09282539682539682, "grad_norm": 0.115234375, "learning_rate": 0.1, "loss": 2.3668196201324463, "step": 2924 }, { "epoch": 0.09288888888888888, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.3718135356903076, "step": 2926 }, { "epoch": 0.09295238095238095, "grad_norm": 0.359375, "learning_rate": 0.1, "loss": 2.3895485401153564, "step": 2928 }, { "epoch": 0.09301587301587301, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.3770720958709717, "step": 2930 }, { "epoch": 0.09307936507936508, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.369741201400757, "step": 2932 }, { "epoch": 0.09314285714285714, "grad_norm": 0.095703125, "learning_rate": 0.1, "loss": 2.377316951751709, "step": 2934 }, { "epoch": 0.09320634920634921, "grad_norm": 0.10302734375, "learning_rate": 0.1, "loss": 2.3814334869384766, "step": 2936 }, { "epoch": 0.09326984126984127, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.376737117767334, "step": 2938 }, { "epoch": 0.09333333333333334, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.3867249488830566, "step": 2940 }, { "epoch": 0.0933968253968254, "grad_norm": 0.0947265625, "learning_rate": 0.1, "loss": 2.3841676712036133, "step": 2942 }, { "epoch": 0.09346031746031747, "grad_norm": 0.349609375, "learning_rate": 0.1, "loss": 2.386505603790283, "step": 2944 }, { "epoch": 0.09352380952380952, "grad_norm": 0.390625, "learning_rate": 0.1, "loss": 2.379009962081909, "step": 2946 }, { "epoch": 0.0935873015873016, "grad_norm": 0.2275390625, "learning_rate": 0.1, "loss": 2.382805109024048, "step": 2948 }, { "epoch": 0.09365079365079365, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.3519742488861084, "step": 2950 }, { "epoch": 0.09371428571428571, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.379425525665283, "step": 2952 }, { "epoch": 0.09377777777777778, "grad_norm": 0.076171875, "learning_rate": 0.1, "loss": 2.368734359741211, "step": 2954 }, { "epoch": 0.09384126984126984, "grad_norm": 0.1953125, "learning_rate": 0.1, "loss": 2.375591278076172, "step": 2956 }, { "epoch": 0.09390476190476191, "grad_norm": 0.236328125, "learning_rate": 0.1, "loss": 2.37282395362854, "step": 2958 }, { "epoch": 0.09396825396825396, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.3608646392822266, "step": 2960 }, { "epoch": 0.09403174603174604, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.3787221908569336, "step": 2962 }, { "epoch": 0.09409523809523809, "grad_norm": 0.1650390625, "learning_rate": 0.1, "loss": 2.3552496433258057, "step": 2964 }, { "epoch": 0.09415873015873016, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.34608793258667, "step": 2966 }, { "epoch": 0.09422222222222222, "grad_norm": 0.322265625, "learning_rate": 0.1, "loss": 2.3694400787353516, "step": 2968 }, { "epoch": 0.09428571428571429, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.3759841918945312, "step": 2970 }, { "epoch": 0.09434920634920635, "grad_norm": 0.11962890625, "learning_rate": 0.1, "loss": 2.3792526721954346, "step": 2972 }, { "epoch": 0.0944126984126984, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.3853352069854736, "step": 2974 }, { "epoch": 0.09447619047619048, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.3611159324645996, "step": 2976 }, { "epoch": 0.09453968253968253, "grad_norm": 0.32421875, "learning_rate": 0.1, "loss": 2.353989839553833, "step": 2978 }, { "epoch": 0.0946031746031746, "grad_norm": 0.337890625, "learning_rate": 0.1, "loss": 2.3706610202789307, "step": 2980 }, { "epoch": 0.09466666666666666, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.3817389011383057, "step": 2982 }, { "epoch": 0.09473015873015873, "grad_norm": 0.0888671875, "learning_rate": 0.1, "loss": 2.369974374771118, "step": 2984 }, { "epoch": 0.09479365079365079, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.345118761062622, "step": 2986 }, { "epoch": 0.09485714285714286, "grad_norm": 0.20703125, "learning_rate": 0.1, "loss": 2.341881036758423, "step": 2988 }, { "epoch": 0.09492063492063492, "grad_norm": 0.2451171875, "learning_rate": 0.1, "loss": 2.339524030685425, "step": 2990 }, { "epoch": 0.09498412698412699, "grad_norm": 0.294921875, "learning_rate": 0.1, "loss": 2.3656091690063477, "step": 2992 }, { "epoch": 0.09504761904761905, "grad_norm": 0.09326171875, "learning_rate": 0.1, "loss": 2.3573195934295654, "step": 2994 }, { "epoch": 0.0951111111111111, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.3674745559692383, "step": 2996 }, { "epoch": 0.09517460317460318, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.4005837440490723, "step": 2998 }, { "epoch": 0.09523809523809523, "grad_norm": 0.1240234375, "learning_rate": 0.1, "loss": 2.3773844242095947, "step": 3000 }, { "epoch": 0.0953015873015873, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.3539998531341553, "step": 3002 }, { "epoch": 0.09536507936507936, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.3537511825561523, "step": 3004 }, { "epoch": 0.09542857142857143, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.3580195903778076, "step": 3006 }, { "epoch": 0.09549206349206349, "grad_norm": 0.111328125, "learning_rate": 0.1, "loss": 2.3513500690460205, "step": 3008 }, { "epoch": 0.09555555555555556, "grad_norm": 0.349609375, "learning_rate": 0.1, "loss": 2.337217092514038, "step": 3010 }, { "epoch": 0.09561904761904762, "grad_norm": 0.48046875, "learning_rate": 0.1, "loss": 2.36157488822937, "step": 3012 }, { "epoch": 0.09568253968253969, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.3663477897644043, "step": 3014 }, { "epoch": 0.09574603174603175, "grad_norm": 0.111328125, "learning_rate": 0.1, "loss": 2.376969575881958, "step": 3016 }, { "epoch": 0.0958095238095238, "grad_norm": 0.10498046875, "learning_rate": 0.1, "loss": 2.3553285598754883, "step": 3018 }, { "epoch": 0.09587301587301587, "grad_norm": 0.1328125, "learning_rate": 0.1, "loss": 2.358536720275879, "step": 3020 }, { "epoch": 0.09593650793650793, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.3684332370758057, "step": 3022 }, { "epoch": 0.096, "grad_norm": 0.11572265625, "learning_rate": 0.1, "loss": 2.337360143661499, "step": 3024 }, { "epoch": 0.09606349206349206, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.3456897735595703, "step": 3026 }, { "epoch": 0.09612698412698413, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.339015483856201, "step": 3028 }, { "epoch": 0.09619047619047619, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.3578503131866455, "step": 3030 }, { "epoch": 0.09625396825396826, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.351618528366089, "step": 3032 }, { "epoch": 0.09631746031746032, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.3544018268585205, "step": 3034 }, { "epoch": 0.09638095238095239, "grad_norm": 0.07421875, "learning_rate": 0.1, "loss": 2.378037691116333, "step": 3036 }, { "epoch": 0.09644444444444444, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.3647165298461914, "step": 3038 }, { "epoch": 0.09650793650793651, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.3651857376098633, "step": 3040 }, { "epoch": 0.09657142857142857, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.3736557960510254, "step": 3042 }, { "epoch": 0.09663492063492063, "grad_norm": 0.2294921875, "learning_rate": 0.1, "loss": 2.359473943710327, "step": 3044 }, { "epoch": 0.0966984126984127, "grad_norm": 0.1103515625, "learning_rate": 0.1, "loss": 2.376636028289795, "step": 3046 }, { "epoch": 0.09676190476190476, "grad_norm": 0.4375, "learning_rate": 0.1, "loss": 2.388477087020874, "step": 3048 }, { "epoch": 0.09682539682539683, "grad_norm": 0.466796875, "learning_rate": 0.1, "loss": 2.384114980697632, "step": 3050 }, { "epoch": 0.09688888888888889, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.3530313968658447, "step": 3052 }, { "epoch": 0.09695238095238096, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.3749337196350098, "step": 3054 }, { "epoch": 0.09701587301587301, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.395002841949463, "step": 3056 }, { "epoch": 0.09707936507936508, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.366583824157715, "step": 3058 }, { "epoch": 0.09714285714285714, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.4006776809692383, "step": 3060 }, { "epoch": 0.09720634920634921, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.3729727268218994, "step": 3062 }, { "epoch": 0.09726984126984127, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.385897159576416, "step": 3064 }, { "epoch": 0.09733333333333333, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.3621749877929688, "step": 3066 }, { "epoch": 0.0973968253968254, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.3830955028533936, "step": 3068 }, { "epoch": 0.09746031746031746, "grad_norm": 0.2353515625, "learning_rate": 0.1, "loss": 2.3684816360473633, "step": 3070 }, { "epoch": 0.09752380952380953, "grad_norm": 0.296875, "learning_rate": 0.1, "loss": 2.364788770675659, "step": 3072 }, { "epoch": 0.09758730158730158, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.3841049671173096, "step": 3074 }, { "epoch": 0.09765079365079365, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.38466739654541, "step": 3076 }, { "epoch": 0.09771428571428571, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.371915102005005, "step": 3078 }, { "epoch": 0.09777777777777778, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.3703203201293945, "step": 3080 }, { "epoch": 0.09784126984126984, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.38539719581604, "step": 3082 }, { "epoch": 0.09790476190476191, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.3492205142974854, "step": 3084 }, { "epoch": 0.09796825396825397, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.386573314666748, "step": 3086 }, { "epoch": 0.09803174603174603, "grad_norm": 0.4453125, "learning_rate": 0.1, "loss": 2.4102532863616943, "step": 3088 }, { "epoch": 0.0980952380952381, "grad_norm": 0.294921875, "learning_rate": 0.1, "loss": 2.379425525665283, "step": 3090 }, { "epoch": 0.09815873015873015, "grad_norm": 0.06396484375, "learning_rate": 0.1, "loss": 2.3573429584503174, "step": 3092 }, { "epoch": 0.09822222222222222, "grad_norm": 0.1181640625, "learning_rate": 0.1, "loss": 2.355159282684326, "step": 3094 }, { "epoch": 0.09828571428571428, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.3854258060455322, "step": 3096 }, { "epoch": 0.09834920634920635, "grad_norm": 0.1328125, "learning_rate": 0.1, "loss": 2.3712375164031982, "step": 3098 }, { "epoch": 0.09841269841269841, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.3901398181915283, "step": 3100 }, { "epoch": 0.09847619047619048, "grad_norm": 0.357421875, "learning_rate": 0.1, "loss": 2.385497570037842, "step": 3102 }, { "epoch": 0.09853968253968254, "grad_norm": 0.326171875, "learning_rate": 0.1, "loss": 2.3499107360839844, "step": 3104 }, { "epoch": 0.09860317460317461, "grad_norm": 0.10400390625, "learning_rate": 0.1, "loss": 2.3500945568084717, "step": 3106 }, { "epoch": 0.09866666666666667, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.378159284591675, "step": 3108 }, { "epoch": 0.09873015873015872, "grad_norm": 0.07080078125, "learning_rate": 0.1, "loss": 2.347813367843628, "step": 3110 }, { "epoch": 0.0987936507936508, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.352243661880493, "step": 3112 }, { "epoch": 0.09885714285714285, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.349508762359619, "step": 3114 }, { "epoch": 0.09892063492063492, "grad_norm": 0.5078125, "learning_rate": 0.1, "loss": 2.374912977218628, "step": 3116 }, { "epoch": 0.09898412698412698, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.363516330718994, "step": 3118 }, { "epoch": 0.09904761904761905, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.356422185897827, "step": 3120 }, { "epoch": 0.09911111111111111, "grad_norm": 0.10986328125, "learning_rate": 0.1, "loss": 2.348611831665039, "step": 3122 }, { "epoch": 0.09917460317460318, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.37180757522583, "step": 3124 }, { "epoch": 0.09923809523809524, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.358886241912842, "step": 3126 }, { "epoch": 0.09930158730158731, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.3564043045043945, "step": 3128 }, { "epoch": 0.09936507936507936, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.3647141456604004, "step": 3130 }, { "epoch": 0.09942857142857142, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.376436233520508, "step": 3132 }, { "epoch": 0.09949206349206349, "grad_norm": 0.2353515625, "learning_rate": 0.1, "loss": 2.3661372661590576, "step": 3134 }, { "epoch": 0.09955555555555555, "grad_norm": 0.10498046875, "learning_rate": 0.1, "loss": 2.3752944469451904, "step": 3136 }, { "epoch": 0.09961904761904762, "grad_norm": 0.11279296875, "learning_rate": 0.1, "loss": 2.360243082046509, "step": 3138 }, { "epoch": 0.09968253968253968, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.360025405883789, "step": 3140 }, { "epoch": 0.09974603174603175, "grad_norm": 0.1142578125, "learning_rate": 0.1, "loss": 2.341798782348633, "step": 3142 }, { "epoch": 0.0998095238095238, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.361220359802246, "step": 3144 }, { "epoch": 0.09987301587301588, "grad_norm": 0.349609375, "learning_rate": 0.1, "loss": 2.3767852783203125, "step": 3146 }, { "epoch": 0.09993650793650793, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.3630876541137695, "step": 3148 }, { "epoch": 0.1, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.378488540649414, "step": 3150 }, { "epoch": 0.1, "eval_loss": 1.8865832090377808, "eval_runtime": 147.295, "eval_samples_per_second": 7.21, "eval_steps_per_second": 1.806, "step": 3150 }, { "epoch": 0.10006349206349206, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 2.3516483306884766, "step": 3152 }, { "epoch": 0.10012698412698413, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.342693567276001, "step": 3154 }, { "epoch": 0.10019047619047619, "grad_norm": 0.1171875, "learning_rate": 0.1, "loss": 2.34096622467041, "step": 3156 }, { "epoch": 0.10025396825396825, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.371737003326416, "step": 3158 }, { "epoch": 0.10031746031746032, "grad_norm": 0.302734375, "learning_rate": 0.1, "loss": 2.3790335655212402, "step": 3160 }, { "epoch": 0.10038095238095238, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.332385778427124, "step": 3162 }, { "epoch": 0.10044444444444445, "grad_norm": 0.40625, "learning_rate": 0.1, "loss": 2.3646175861358643, "step": 3164 }, { "epoch": 0.1005079365079365, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.3695333003997803, "step": 3166 }, { "epoch": 0.10057142857142858, "grad_norm": 0.10546875, "learning_rate": 0.1, "loss": 2.3514139652252197, "step": 3168 }, { "epoch": 0.10063492063492063, "grad_norm": 0.10546875, "learning_rate": 0.1, "loss": 2.3347651958465576, "step": 3170 }, { "epoch": 0.1006984126984127, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.3630435466766357, "step": 3172 }, { "epoch": 0.10076190476190476, "grad_norm": 0.10205078125, "learning_rate": 0.1, "loss": 2.336247444152832, "step": 3174 }, { "epoch": 0.10082539682539683, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.3396432399749756, "step": 3176 }, { "epoch": 0.10088888888888889, "grad_norm": 0.349609375, "learning_rate": 0.1, "loss": 2.380974769592285, "step": 3178 }, { "epoch": 0.10095238095238095, "grad_norm": 0.38671875, "learning_rate": 0.1, "loss": 2.337496519088745, "step": 3180 }, { "epoch": 0.10101587301587302, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.3597891330718994, "step": 3182 }, { "epoch": 0.10107936507936507, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.3533506393432617, "step": 3184 }, { "epoch": 0.10114285714285715, "grad_norm": 0.1953125, "learning_rate": 0.1, "loss": 2.36279559135437, "step": 3186 }, { "epoch": 0.1012063492063492, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.3270177841186523, "step": 3188 }, { "epoch": 0.10126984126984127, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.335341691970825, "step": 3190 }, { "epoch": 0.10133333333333333, "grad_norm": 0.052001953125, "learning_rate": 0.1, "loss": 2.3372156620025635, "step": 3192 }, { "epoch": 0.1013968253968254, "grad_norm": 0.07177734375, "learning_rate": 0.1, "loss": 2.3587396144866943, "step": 3194 }, { "epoch": 0.10146031746031746, "grad_norm": 0.10107421875, "learning_rate": 0.1, "loss": 2.351576566696167, "step": 3196 }, { "epoch": 0.10152380952380953, "grad_norm": 0.0869140625, "learning_rate": 0.1, "loss": 2.3531692028045654, "step": 3198 }, { "epoch": 0.10158730158730159, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.3487253189086914, "step": 3200 }, { "epoch": 0.10165079365079364, "grad_norm": 0.419921875, "learning_rate": 0.1, "loss": 2.329202175140381, "step": 3202 }, { "epoch": 0.10171428571428572, "grad_norm": 0.306640625, "learning_rate": 0.1, "loss": 2.312441349029541, "step": 3204 }, { "epoch": 0.10177777777777777, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.3496296405792236, "step": 3206 }, { "epoch": 0.10184126984126984, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.341704845428467, "step": 3208 }, { "epoch": 0.1019047619047619, "grad_norm": 0.08837890625, "learning_rate": 0.1, "loss": 2.334219217300415, "step": 3210 }, { "epoch": 0.10196825396825397, "grad_norm": 0.07666015625, "learning_rate": 0.1, "loss": 2.350252866744995, "step": 3212 }, { "epoch": 0.10203174603174603, "grad_norm": 0.05712890625, "learning_rate": 0.1, "loss": 2.336597204208374, "step": 3214 }, { "epoch": 0.1020952380952381, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.3345773220062256, "step": 3216 }, { "epoch": 0.10215873015873016, "grad_norm": 0.5078125, "learning_rate": 0.1, "loss": 2.351301670074463, "step": 3218 }, { "epoch": 0.10222222222222223, "grad_norm": 0.2177734375, "learning_rate": 0.1, "loss": 2.3316478729248047, "step": 3220 }, { "epoch": 0.10228571428571429, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.3379416465759277, "step": 3222 }, { "epoch": 0.10234920634920634, "grad_norm": 0.2177734375, "learning_rate": 0.1, "loss": 2.323058843612671, "step": 3224 }, { "epoch": 0.10241269841269841, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.3386640548706055, "step": 3226 }, { "epoch": 0.10247619047619047, "grad_norm": 0.2177734375, "learning_rate": 0.1, "loss": 2.3268418312072754, "step": 3228 }, { "epoch": 0.10253968253968254, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.3212616443634033, "step": 3230 }, { "epoch": 0.1026031746031746, "grad_norm": 0.0869140625, "learning_rate": 0.1, "loss": 2.352118730545044, "step": 3232 }, { "epoch": 0.10266666666666667, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.3269803524017334, "step": 3234 }, { "epoch": 0.10273015873015873, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.301157236099243, "step": 3236 }, { "epoch": 0.1027936507936508, "grad_norm": 0.0986328125, "learning_rate": 0.1, "loss": 2.318889856338501, "step": 3238 }, { "epoch": 0.10285714285714286, "grad_norm": 0.068359375, "learning_rate": 0.1, "loss": 2.336484909057617, "step": 3240 }, { "epoch": 0.10292063492063493, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.3263423442840576, "step": 3242 }, { "epoch": 0.10298412698412698, "grad_norm": 0.1015625, "learning_rate": 0.1, "loss": 2.3313236236572266, "step": 3244 }, { "epoch": 0.10304761904761905, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.348193407058716, "step": 3246 }, { "epoch": 0.10311111111111111, "grad_norm": 0.1142578125, "learning_rate": 0.1, "loss": 2.34021258354187, "step": 3248 }, { "epoch": 0.10317460317460317, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.3320717811584473, "step": 3250 }, { "epoch": 0.10323809523809524, "grad_norm": 0.494140625, "learning_rate": 0.1, "loss": 2.3512299060821533, "step": 3252 }, { "epoch": 0.1033015873015873, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.303234815597534, "step": 3254 }, { "epoch": 0.10336507936507937, "grad_norm": 0.236328125, "learning_rate": 0.1, "loss": 2.3494513034820557, "step": 3256 }, { "epoch": 0.10342857142857143, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.32682204246521, "step": 3258 }, { "epoch": 0.1034920634920635, "grad_norm": 0.357421875, "learning_rate": 0.1, "loss": 2.352147340774536, "step": 3260 }, { "epoch": 0.10355555555555555, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.3208167552948, "step": 3262 }, { "epoch": 0.10361904761904762, "grad_norm": 0.2275390625, "learning_rate": 0.1, "loss": 2.344122886657715, "step": 3264 }, { "epoch": 0.10368253968253968, "grad_norm": 0.1943359375, "learning_rate": 0.1, "loss": 2.3330907821655273, "step": 3266 }, { "epoch": 0.10374603174603175, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.3386008739471436, "step": 3268 }, { "epoch": 0.10380952380952381, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.3361656665802, "step": 3270 }, { "epoch": 0.10387301587301587, "grad_norm": 0.111328125, "learning_rate": 0.1, "loss": 2.3612117767333984, "step": 3272 }, { "epoch": 0.10393650793650794, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.349057197570801, "step": 3274 }, { "epoch": 0.104, "grad_norm": 0.0703125, "learning_rate": 0.1, "loss": 2.3414549827575684, "step": 3276 }, { "epoch": 0.10406349206349207, "grad_norm": 0.10791015625, "learning_rate": 0.1, "loss": 2.33007550239563, "step": 3278 }, { "epoch": 0.10412698412698412, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.3456130027770996, "step": 3280 }, { "epoch": 0.1041904761904762, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.3427767753601074, "step": 3282 }, { "epoch": 0.10425396825396825, "grad_norm": 0.10693359375, "learning_rate": 0.1, "loss": 2.3201069831848145, "step": 3284 }, { "epoch": 0.10431746031746032, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.356478691101074, "step": 3286 }, { "epoch": 0.10438095238095238, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.360605001449585, "step": 3288 }, { "epoch": 0.10444444444444445, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.3378303050994873, "step": 3290 }, { "epoch": 0.10450793650793651, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.342726945877075, "step": 3292 }, { "epoch": 0.10457142857142857, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.3300466537475586, "step": 3294 }, { "epoch": 0.10463492063492064, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.3365747928619385, "step": 3296 }, { "epoch": 0.1046984126984127, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.3608007431030273, "step": 3298 }, { "epoch": 0.10476190476190476, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.3410987854003906, "step": 3300 }, { "epoch": 0.10482539682539682, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.315688133239746, "step": 3302 }, { "epoch": 0.10488888888888889, "grad_norm": 0.1142578125, "learning_rate": 0.1, "loss": 2.352668285369873, "step": 3304 }, { "epoch": 0.10495238095238095, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.3279716968536377, "step": 3306 }, { "epoch": 0.10501587301587302, "grad_norm": 0.26171875, "learning_rate": 0.1, "loss": 2.3384768962860107, "step": 3308 }, { "epoch": 0.10507936507936508, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.3501646518707275, "step": 3310 }, { "epoch": 0.10514285714285715, "grad_norm": 0.052490234375, "learning_rate": 0.1, "loss": 2.3263134956359863, "step": 3312 }, { "epoch": 0.1052063492063492, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.3680107593536377, "step": 3314 }, { "epoch": 0.10526984126984126, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.3318769931793213, "step": 3316 }, { "epoch": 0.10533333333333333, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.351991891860962, "step": 3318 }, { "epoch": 0.10539682539682539, "grad_norm": 0.47265625, "learning_rate": 0.1, "loss": 2.3614602088928223, "step": 3320 }, { "epoch": 0.10546031746031746, "grad_norm": 0.26953125, "learning_rate": 0.1, "loss": 2.3518800735473633, "step": 3322 }, { "epoch": 0.10552380952380952, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.353815793991089, "step": 3324 }, { "epoch": 0.10558730158730159, "grad_norm": 0.259765625, "learning_rate": 0.1, "loss": 2.362107276916504, "step": 3326 }, { "epoch": 0.10565079365079365, "grad_norm": 0.404296875, "learning_rate": 0.1, "loss": 2.3262107372283936, "step": 3328 }, { "epoch": 0.10571428571428572, "grad_norm": 0.08642578125, "learning_rate": 0.1, "loss": 2.338365077972412, "step": 3330 }, { "epoch": 0.10577777777777778, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.366158962249756, "step": 3332 }, { "epoch": 0.10584126984126985, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.329056978225708, "step": 3334 }, { "epoch": 0.1059047619047619, "grad_norm": 0.0634765625, "learning_rate": 0.1, "loss": 2.304471254348755, "step": 3336 }, { "epoch": 0.10596825396825396, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.342478036880493, "step": 3338 }, { "epoch": 0.10603174603174603, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.379951238632202, "step": 3340 }, { "epoch": 0.10609523809523809, "grad_norm": 0.11572265625, "learning_rate": 0.1, "loss": 2.339369535446167, "step": 3342 }, { "epoch": 0.10615873015873016, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.2961819171905518, "step": 3344 }, { "epoch": 0.10622222222222222, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.314613103866577, "step": 3346 }, { "epoch": 0.10628571428571429, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.322188138961792, "step": 3348 }, { "epoch": 0.10634920634920635, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.3588831424713135, "step": 3350 }, { "epoch": 0.10641269841269842, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.3354945182800293, "step": 3352 }, { "epoch": 0.10647619047619047, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.350928783416748, "step": 3354 }, { "epoch": 0.10653968253968255, "grad_norm": 0.2177734375, "learning_rate": 0.1, "loss": 2.348881721496582, "step": 3356 }, { "epoch": 0.1066031746031746, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.3308064937591553, "step": 3358 }, { "epoch": 0.10666666666666667, "grad_norm": 0.36328125, "learning_rate": 0.1, "loss": 2.3493831157684326, "step": 3360 }, { "epoch": 0.10673015873015873, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.328228712081909, "step": 3362 }, { "epoch": 0.10679365079365079, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.3473575115203857, "step": 3364 }, { "epoch": 0.10685714285714286, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.373417615890503, "step": 3366 }, { "epoch": 0.10692063492063492, "grad_norm": 0.057861328125, "learning_rate": 0.1, "loss": 2.3528759479522705, "step": 3368 }, { "epoch": 0.10698412698412699, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.3705062866210938, "step": 3370 }, { "epoch": 0.10704761904761904, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.362781286239624, "step": 3372 }, { "epoch": 0.10711111111111112, "grad_norm": 0.2392578125, "learning_rate": 0.1, "loss": 2.3496038913726807, "step": 3374 }, { "epoch": 0.10717460317460317, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.3403730392456055, "step": 3376 }, { "epoch": 0.10723809523809524, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.3697726726531982, "step": 3378 }, { "epoch": 0.1073015873015873, "grad_norm": 0.314453125, "learning_rate": 0.1, "loss": 2.3709375858306885, "step": 3380 }, { "epoch": 0.10736507936507937, "grad_norm": 0.408203125, "learning_rate": 0.1, "loss": 2.350872755050659, "step": 3382 }, { "epoch": 0.10742857142857143, "grad_norm": 0.12255859375, "learning_rate": 0.1, "loss": 2.3498642444610596, "step": 3384 }, { "epoch": 0.10749206349206349, "grad_norm": 0.07080078125, "learning_rate": 0.1, "loss": 2.3652641773223877, "step": 3386 }, { "epoch": 0.10755555555555556, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.3551740646362305, "step": 3388 }, { "epoch": 0.10761904761904761, "grad_norm": 0.1953125, "learning_rate": 0.1, "loss": 2.3340487480163574, "step": 3390 }, { "epoch": 0.10768253968253969, "grad_norm": 0.1220703125, "learning_rate": 0.1, "loss": 2.348574638366699, "step": 3392 }, { "epoch": 0.10774603174603174, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.3704099655151367, "step": 3394 }, { "epoch": 0.10780952380952381, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.3355631828308105, "step": 3396 }, { "epoch": 0.10787301587301587, "grad_norm": 0.08837890625, "learning_rate": 0.1, "loss": 2.3621702194213867, "step": 3398 }, { "epoch": 0.10793650793650794, "grad_norm": 0.0849609375, "learning_rate": 0.1, "loss": 2.369783878326416, "step": 3400 }, { "epoch": 0.108, "grad_norm": 0.2275390625, "learning_rate": 0.1, "loss": 2.3476386070251465, "step": 3402 }, { "epoch": 0.10806349206349207, "grad_norm": 0.375, "learning_rate": 0.1, "loss": 2.3561904430389404, "step": 3404 }, { "epoch": 0.10812698412698413, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.3483197689056396, "step": 3406 }, { "epoch": 0.10819047619047618, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.3494465351104736, "step": 3408 }, { "epoch": 0.10825396825396826, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.3924953937530518, "step": 3410 }, { "epoch": 0.10831746031746031, "grad_norm": 0.2333984375, "learning_rate": 0.1, "loss": 2.372633695602417, "step": 3412 }, { "epoch": 0.10838095238095238, "grad_norm": 0.2734375, "learning_rate": 0.1, "loss": 2.3445582389831543, "step": 3414 }, { "epoch": 0.10844444444444444, "grad_norm": 0.24609375, "learning_rate": 0.1, "loss": 2.3596572875976562, "step": 3416 }, { "epoch": 0.10850793650793651, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.340954065322876, "step": 3418 }, { "epoch": 0.10857142857142857, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.3584368228912354, "step": 3420 }, { "epoch": 0.10863492063492064, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.3443057537078857, "step": 3422 }, { "epoch": 0.1086984126984127, "grad_norm": 0.11962890625, "learning_rate": 0.1, "loss": 2.361745834350586, "step": 3424 }, { "epoch": 0.10876190476190477, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.370006561279297, "step": 3426 }, { "epoch": 0.10882539682539683, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.3342065811157227, "step": 3428 }, { "epoch": 0.10888888888888888, "grad_norm": 0.2216796875, "learning_rate": 0.1, "loss": 2.345484495162964, "step": 3430 }, { "epoch": 0.10895238095238095, "grad_norm": 0.408203125, "learning_rate": 0.1, "loss": 2.3771321773529053, "step": 3432 }, { "epoch": 0.10901587301587301, "grad_norm": 0.43359375, "learning_rate": 0.1, "loss": 2.3620004653930664, "step": 3434 }, { "epoch": 0.10907936507936508, "grad_norm": 0.35546875, "learning_rate": 0.1, "loss": 2.372932195663452, "step": 3436 }, { "epoch": 0.10914285714285714, "grad_norm": 0.1025390625, "learning_rate": 0.1, "loss": 2.3571603298187256, "step": 3438 }, { "epoch": 0.10920634920634921, "grad_norm": 0.07861328125, "learning_rate": 0.1, "loss": 2.3662962913513184, "step": 3440 }, { "epoch": 0.10926984126984127, "grad_norm": 0.07568359375, "learning_rate": 0.1, "loss": 2.3586056232452393, "step": 3442 }, { "epoch": 0.10933333333333334, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.3372647762298584, "step": 3444 }, { "epoch": 0.1093968253968254, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.347508430480957, "step": 3446 }, { "epoch": 0.10946031746031747, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.339944362640381, "step": 3448 }, { "epoch": 0.10952380952380952, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.349416494369507, "step": 3450 }, { "epoch": 0.10958730158730158, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.3513553142547607, "step": 3452 }, { "epoch": 0.10965079365079365, "grad_norm": 0.1376953125, "learning_rate": 0.1, "loss": 2.375195026397705, "step": 3454 }, { "epoch": 0.10971428571428571, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.3614697456359863, "step": 3456 }, { "epoch": 0.10977777777777778, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.365938186645508, "step": 3458 }, { "epoch": 0.10984126984126984, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.353409767150879, "step": 3460 }, { "epoch": 0.10990476190476191, "grad_norm": 0.3046875, "learning_rate": 0.1, "loss": 2.3247878551483154, "step": 3462 }, { "epoch": 0.10996825396825397, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.3790037631988525, "step": 3464 }, { "epoch": 0.11003174603174604, "grad_norm": 0.10009765625, "learning_rate": 0.1, "loss": 2.368886709213257, "step": 3466 }, { "epoch": 0.1100952380952381, "grad_norm": 0.1181640625, "learning_rate": 0.1, "loss": 2.3807566165924072, "step": 3468 }, { "epoch": 0.11015873015873016, "grad_norm": 0.26953125, "learning_rate": 0.1, "loss": 2.3617265224456787, "step": 3470 }, { "epoch": 0.11022222222222222, "grad_norm": 0.36328125, "learning_rate": 0.1, "loss": 2.3667521476745605, "step": 3472 }, { "epoch": 0.11028571428571429, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.3589887619018555, "step": 3474 }, { "epoch": 0.11034920634920635, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.3429319858551025, "step": 3476 }, { "epoch": 0.1104126984126984, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.338932514190674, "step": 3478 }, { "epoch": 0.11047619047619048, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.355290651321411, "step": 3480 }, { "epoch": 0.11053968253968253, "grad_norm": 0.07177734375, "learning_rate": 0.1, "loss": 2.3592286109924316, "step": 3482 }, { "epoch": 0.1106031746031746, "grad_norm": 0.056884765625, "learning_rate": 0.1, "loss": 2.3416621685028076, "step": 3484 }, { "epoch": 0.11066666666666666, "grad_norm": 0.06689453125, "learning_rate": 0.1, "loss": 2.3354456424713135, "step": 3486 }, { "epoch": 0.11073015873015873, "grad_norm": 0.10595703125, "learning_rate": 0.1, "loss": 2.345093250274658, "step": 3488 }, { "epoch": 0.11079365079365079, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.346799850463867, "step": 3490 }, { "epoch": 0.11085714285714286, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.3764498233795166, "step": 3492 }, { "epoch": 0.11092063492063492, "grad_norm": 0.26171875, "learning_rate": 0.1, "loss": 2.3612849712371826, "step": 3494 }, { "epoch": 0.11098412698412699, "grad_norm": 0.34375, "learning_rate": 0.1, "loss": 2.3757095336914062, "step": 3496 }, { "epoch": 0.11104761904761905, "grad_norm": 0.07177734375, "learning_rate": 0.1, "loss": 2.3334944248199463, "step": 3498 }, { "epoch": 0.1111111111111111, "grad_norm": 0.447265625, "learning_rate": 0.1, "loss": 2.375708818435669, "step": 3500 }, { "epoch": 0.11117460317460318, "grad_norm": 0.3515625, "learning_rate": 0.1, "loss": 2.368227958679199, "step": 3502 }, { "epoch": 0.11123809523809523, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.37325382232666, "step": 3504 }, { "epoch": 0.1113015873015873, "grad_norm": 0.10498046875, "learning_rate": 0.1, "loss": 2.34584641456604, "step": 3506 }, { "epoch": 0.11136507936507936, "grad_norm": 0.064453125, "learning_rate": 0.1, "loss": 2.369048833847046, "step": 3508 }, { "epoch": 0.11142857142857143, "grad_norm": 0.095703125, "learning_rate": 0.1, "loss": 2.3563108444213867, "step": 3510 }, { "epoch": 0.11149206349206349, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.369086742401123, "step": 3512 }, { "epoch": 0.11155555555555556, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.3520584106445312, "step": 3514 }, { "epoch": 0.11161904761904762, "grad_norm": 0.40234375, "learning_rate": 0.1, "loss": 2.390497922897339, "step": 3516 }, { "epoch": 0.11168253968253969, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.362485408782959, "step": 3518 }, { "epoch": 0.11174603174603175, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.3851490020751953, "step": 3520 }, { "epoch": 0.1118095238095238, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.3497154712677, "step": 3522 }, { "epoch": 0.11187301587301587, "grad_norm": 0.060791015625, "learning_rate": 0.1, "loss": 2.373983860015869, "step": 3524 }, { "epoch": 0.11193650793650793, "grad_norm": 0.08349609375, "learning_rate": 0.1, "loss": 2.355860948562622, "step": 3526 }, { "epoch": 0.112, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.3451035022735596, "step": 3528 }, { "epoch": 0.11206349206349206, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.352815866470337, "step": 3530 }, { "epoch": 0.11212698412698413, "grad_norm": 0.1162109375, "learning_rate": 0.1, "loss": 2.3629519939422607, "step": 3532 }, { "epoch": 0.11219047619047619, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.346721887588501, "step": 3534 }, { "epoch": 0.11225396825396826, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.36957049369812, "step": 3536 }, { "epoch": 0.11231746031746032, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.351757287979126, "step": 3538 }, { "epoch": 0.11238095238095239, "grad_norm": 0.0966796875, "learning_rate": 0.1, "loss": 2.3268871307373047, "step": 3540 }, { "epoch": 0.11244444444444444, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.3664298057556152, "step": 3542 }, { "epoch": 0.1125079365079365, "grad_norm": 0.431640625, "learning_rate": 0.1, "loss": 2.3721230030059814, "step": 3544 }, { "epoch": 0.11257142857142857, "grad_norm": 0.0751953125, "learning_rate": 0.1, "loss": 2.362917184829712, "step": 3546 }, { "epoch": 0.11263492063492063, "grad_norm": 0.08056640625, "learning_rate": 0.1, "loss": 2.371340274810791, "step": 3548 }, { "epoch": 0.1126984126984127, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.327317953109741, "step": 3550 }, { "epoch": 0.11276190476190476, "grad_norm": 0.10888671875, "learning_rate": 0.1, "loss": 2.3503735065460205, "step": 3552 }, { "epoch": 0.11282539682539683, "grad_norm": 0.07568359375, "learning_rate": 0.1, "loss": 2.377936840057373, "step": 3554 }, { "epoch": 0.11288888888888889, "grad_norm": 0.0888671875, "learning_rate": 0.1, "loss": 2.3500468730926514, "step": 3556 }, { "epoch": 0.11295238095238096, "grad_norm": 0.345703125, "learning_rate": 0.1, "loss": 2.3601861000061035, "step": 3558 }, { "epoch": 0.11301587301587301, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.351044178009033, "step": 3560 }, { "epoch": 0.11307936507936509, "grad_norm": 0.1123046875, "learning_rate": 0.1, "loss": 2.3545966148376465, "step": 3562 }, { "epoch": 0.11314285714285714, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.3484346866607666, "step": 3564 }, { "epoch": 0.11320634920634921, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.370769500732422, "step": 3566 }, { "epoch": 0.11326984126984127, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.336609363555908, "step": 3568 }, { "epoch": 0.11333333333333333, "grad_norm": 0.11669921875, "learning_rate": 0.1, "loss": 2.35955810546875, "step": 3570 }, { "epoch": 0.1133968253968254, "grad_norm": 0.328125, "learning_rate": 0.1, "loss": 2.366596221923828, "step": 3572 }, { "epoch": 0.11346031746031746, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.341592788696289, "step": 3574 }, { "epoch": 0.11352380952380953, "grad_norm": 0.07421875, "learning_rate": 0.1, "loss": 2.3348073959350586, "step": 3576 }, { "epoch": 0.11358730158730158, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.329871892929077, "step": 3578 }, { "epoch": 0.11365079365079366, "grad_norm": 0.12353515625, "learning_rate": 0.1, "loss": 2.392683267593384, "step": 3580 }, { "epoch": 0.11371428571428571, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.3424203395843506, "step": 3582 }, { "epoch": 0.11377777777777778, "grad_norm": 0.06787109375, "learning_rate": 0.1, "loss": 2.3983778953552246, "step": 3584 }, { "epoch": 0.11384126984126984, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.3465158939361572, "step": 3586 }, { "epoch": 0.11390476190476191, "grad_norm": 0.515625, "learning_rate": 0.1, "loss": 2.368788957595825, "step": 3588 }, { "epoch": 0.11396825396825397, "grad_norm": 0.3359375, "learning_rate": 0.1, "loss": 2.3493800163269043, "step": 3590 }, { "epoch": 0.11403174603174603, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.3818132877349854, "step": 3592 }, { "epoch": 0.1140952380952381, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.3634862899780273, "step": 3594 }, { "epoch": 0.11415873015873015, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.3569865226745605, "step": 3596 }, { "epoch": 0.11422222222222222, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.3751280307769775, "step": 3598 }, { "epoch": 0.11428571428571428, "grad_norm": 0.45703125, "learning_rate": 0.1, "loss": 2.3810863494873047, "step": 3600 }, { "epoch": 0.11434920634920635, "grad_norm": 0.06787109375, "learning_rate": 0.1, "loss": 2.3514742851257324, "step": 3602 }, { "epoch": 0.11441269841269841, "grad_norm": 0.09423828125, "learning_rate": 0.1, "loss": 2.3743557929992676, "step": 3604 }, { "epoch": 0.11447619047619048, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.384312868118286, "step": 3606 }, { "epoch": 0.11453968253968254, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.348015308380127, "step": 3608 }, { "epoch": 0.11460317460317461, "grad_norm": 0.0546875, "learning_rate": 0.1, "loss": 2.3827500343322754, "step": 3610 }, { "epoch": 0.11466666666666667, "grad_norm": 0.0390625, "learning_rate": 0.1, "loss": 2.377150774002075, "step": 3612 }, { "epoch": 0.11473015873015872, "grad_norm": 0.08056640625, "learning_rate": 0.1, "loss": 2.380643844604492, "step": 3614 }, { "epoch": 0.1147936507936508, "grad_norm": 0.07666015625, "learning_rate": 0.1, "loss": 2.3838653564453125, "step": 3616 }, { "epoch": 0.11485714285714285, "grad_norm": 0.09521484375, "learning_rate": 0.1, "loss": 2.3862435817718506, "step": 3618 }, { "epoch": 0.11492063492063492, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.4003841876983643, "step": 3620 }, { "epoch": 0.11498412698412698, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.3951573371887207, "step": 3622 }, { "epoch": 0.11504761904761905, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.3978824615478516, "step": 3624 }, { "epoch": 0.11511111111111111, "grad_norm": 0.2333984375, "learning_rate": 0.1, "loss": 2.3834142684936523, "step": 3626 }, { "epoch": 0.11517460317460318, "grad_norm": 0.44921875, "learning_rate": 0.1, "loss": 2.3908584117889404, "step": 3628 }, { "epoch": 0.11523809523809524, "grad_norm": 0.220703125, "learning_rate": 0.1, "loss": 2.383060932159424, "step": 3630 }, { "epoch": 0.11530158730158731, "grad_norm": 0.10888671875, "learning_rate": 0.1, "loss": 2.381296396255493, "step": 3632 }, { "epoch": 0.11536507936507936, "grad_norm": 0.3984375, "learning_rate": 0.1, "loss": 2.3893206119537354, "step": 3634 }, { "epoch": 0.11542857142857142, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.3399622440338135, "step": 3636 }, { "epoch": 0.1154920634920635, "grad_norm": 0.251953125, "learning_rate": 0.1, "loss": 2.375021457672119, "step": 3638 }, { "epoch": 0.11555555555555555, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.369903564453125, "step": 3640 }, { "epoch": 0.11561904761904762, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.376779794692993, "step": 3642 }, { "epoch": 0.11568253968253968, "grad_norm": 0.09619140625, "learning_rate": 0.1, "loss": 2.3650901317596436, "step": 3644 }, { "epoch": 0.11574603174603175, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.3647046089172363, "step": 3646 }, { "epoch": 0.1158095238095238, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.379714250564575, "step": 3648 }, { "epoch": 0.11587301587301588, "grad_norm": 0.0732421875, "learning_rate": 0.1, "loss": 2.345123529434204, "step": 3650 }, { "epoch": 0.11593650793650793, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.367623805999756, "step": 3652 }, { "epoch": 0.116, "grad_norm": 0.1181640625, "learning_rate": 0.1, "loss": 2.3478593826293945, "step": 3654 }, { "epoch": 0.11606349206349206, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.3832061290740967, "step": 3656 }, { "epoch": 0.11612698412698412, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.371188163757324, "step": 3658 }, { "epoch": 0.11619047619047619, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.370065212249756, "step": 3660 }, { "epoch": 0.11625396825396825, "grad_norm": 0.37890625, "learning_rate": 0.1, "loss": 2.3772034645080566, "step": 3662 }, { "epoch": 0.11631746031746032, "grad_norm": 0.26953125, "learning_rate": 0.1, "loss": 2.370811700820923, "step": 3664 }, { "epoch": 0.11638095238095238, "grad_norm": 0.052978515625, "learning_rate": 0.1, "loss": 2.395909309387207, "step": 3666 }, { "epoch": 0.11644444444444445, "grad_norm": 0.0654296875, "learning_rate": 0.1, "loss": 2.3376917839050293, "step": 3668 }, { "epoch": 0.1165079365079365, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.391631841659546, "step": 3670 }, { "epoch": 0.11657142857142858, "grad_norm": 0.244140625, "learning_rate": 0.1, "loss": 2.366126537322998, "step": 3672 }, { "epoch": 0.11663492063492063, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.3703672885894775, "step": 3674 }, { "epoch": 0.1166984126984127, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.3635590076446533, "step": 3676 }, { "epoch": 0.11676190476190476, "grad_norm": 0.09619140625, "learning_rate": 0.1, "loss": 2.3768723011016846, "step": 3678 }, { "epoch": 0.11682539682539683, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.339832067489624, "step": 3680 }, { "epoch": 0.11688888888888889, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.3715245723724365, "step": 3682 }, { "epoch": 0.11695238095238095, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.3656363487243652, "step": 3684 }, { "epoch": 0.11701587301587302, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.37178635597229, "step": 3686 }, { "epoch": 0.11707936507936507, "grad_norm": 0.357421875, "learning_rate": 0.1, "loss": 2.3885228633880615, "step": 3688 }, { "epoch": 0.11714285714285715, "grad_norm": 0.3203125, "learning_rate": 0.1, "loss": 2.3601455688476562, "step": 3690 }, { "epoch": 0.1172063492063492, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.351393222808838, "step": 3692 }, { "epoch": 0.11726984126984127, "grad_norm": 0.302734375, "learning_rate": 0.1, "loss": 2.380105972290039, "step": 3694 }, { "epoch": 0.11733333333333333, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.3565802574157715, "step": 3696 }, { "epoch": 0.1173968253968254, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.379729747772217, "step": 3698 }, { "epoch": 0.11746031746031746, "grad_norm": 0.1103515625, "learning_rate": 0.1, "loss": 2.346740484237671, "step": 3700 }, { "epoch": 0.11752380952380953, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.3742175102233887, "step": 3702 }, { "epoch": 0.11758730158730159, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.3353805541992188, "step": 3704 }, { "epoch": 0.11765079365079364, "grad_norm": 0.07421875, "learning_rate": 0.1, "loss": 2.352834463119507, "step": 3706 }, { "epoch": 0.11771428571428572, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.3594717979431152, "step": 3708 }, { "epoch": 0.11777777777777777, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.346648693084717, "step": 3710 }, { "epoch": 0.11784126984126984, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.347315788269043, "step": 3712 }, { "epoch": 0.1179047619047619, "grad_norm": 0.2197265625, "learning_rate": 0.1, "loss": 2.3911542892456055, "step": 3714 }, { "epoch": 0.11796825396825397, "grad_norm": 0.056884765625, "learning_rate": 0.1, "loss": 2.3760995864868164, "step": 3716 }, { "epoch": 0.11803174603174603, "grad_norm": 0.31640625, "learning_rate": 0.1, "loss": 2.3653604984283447, "step": 3718 }, { "epoch": 0.1180952380952381, "grad_norm": 0.4921875, "learning_rate": 0.1, "loss": 2.365247964859009, "step": 3720 }, { "epoch": 0.11815873015873016, "grad_norm": 0.1982421875, "learning_rate": 0.1, "loss": 2.3306002616882324, "step": 3722 }, { "epoch": 0.11822222222222223, "grad_norm": 0.0634765625, "learning_rate": 0.1, "loss": 2.3638482093811035, "step": 3724 }, { "epoch": 0.11828571428571429, "grad_norm": 0.072265625, "learning_rate": 0.1, "loss": 2.3562734127044678, "step": 3726 }, { "epoch": 0.11834920634920634, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.3544135093688965, "step": 3728 }, { "epoch": 0.11841269841269841, "grad_norm": 0.314453125, "learning_rate": 0.1, "loss": 2.3340649604797363, "step": 3730 }, { "epoch": 0.11847619047619047, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.382652521133423, "step": 3732 }, { "epoch": 0.11853968253968254, "grad_norm": 0.08837890625, "learning_rate": 0.1, "loss": 2.3551390171051025, "step": 3734 }, { "epoch": 0.1186031746031746, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.3323471546173096, "step": 3736 }, { "epoch": 0.11866666666666667, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.356943130493164, "step": 3738 }, { "epoch": 0.11873015873015873, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.333209753036499, "step": 3740 }, { "epoch": 0.1187936507936508, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.3609836101531982, "step": 3742 }, { "epoch": 0.11885714285714286, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.3512656688690186, "step": 3744 }, { "epoch": 0.11892063492063493, "grad_norm": 0.35546875, "learning_rate": 0.1, "loss": 2.3496434688568115, "step": 3746 }, { "epoch": 0.11898412698412698, "grad_norm": 0.2333984375, "learning_rate": 0.1, "loss": 2.3704543113708496, "step": 3748 }, { "epoch": 0.11904761904761904, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.3427162170410156, "step": 3750 }, { "epoch": 0.11911111111111111, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.337406635284424, "step": 3752 }, { "epoch": 0.11917460317460317, "grad_norm": 0.21484375, "learning_rate": 0.1, "loss": 2.3717777729034424, "step": 3754 }, { "epoch": 0.11923809523809524, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.3519089221954346, "step": 3756 }, { "epoch": 0.1193015873015873, "grad_norm": 0.20703125, "learning_rate": 0.1, "loss": 2.3558361530303955, "step": 3758 }, { "epoch": 0.11936507936507937, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.3396553993225098, "step": 3760 }, { "epoch": 0.11942857142857143, "grad_norm": 0.208984375, "learning_rate": 0.1, "loss": 2.328169107437134, "step": 3762 }, { "epoch": 0.1194920634920635, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.3516294956207275, "step": 3764 }, { "epoch": 0.11955555555555555, "grad_norm": 0.0595703125, "learning_rate": 0.1, "loss": 2.365053176879883, "step": 3766 }, { "epoch": 0.11961904761904762, "grad_norm": 0.1376953125, "learning_rate": 0.1, "loss": 2.334383487701416, "step": 3768 }, { "epoch": 0.11968253968253968, "grad_norm": 0.1376953125, "learning_rate": 0.1, "loss": 2.336587905883789, "step": 3770 }, { "epoch": 0.11974603174603174, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.352851152420044, "step": 3772 }, { "epoch": 0.11980952380952381, "grad_norm": 0.11328125, "learning_rate": 0.1, "loss": 2.3392863273620605, "step": 3774 }, { "epoch": 0.11987301587301587, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.371344566345215, "step": 3776 }, { "epoch": 0.11993650793650794, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.341357707977295, "step": 3778 }, { "epoch": 0.12, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 2.366474151611328, "step": 3780 }, { "epoch": 0.12006349206349207, "grad_norm": 0.09619140625, "learning_rate": 0.1, "loss": 2.348088264465332, "step": 3782 }, { "epoch": 0.12012698412698412, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.3548367023468018, "step": 3784 }, { "epoch": 0.1201904761904762, "grad_norm": 0.115234375, "learning_rate": 0.1, "loss": 2.321396589279175, "step": 3786 }, { "epoch": 0.12025396825396825, "grad_norm": 0.37109375, "learning_rate": 0.1, "loss": 2.335021734237671, "step": 3788 }, { "epoch": 0.12031746031746032, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.3532278537750244, "step": 3790 }, { "epoch": 0.12038095238095238, "grad_norm": 0.1953125, "learning_rate": 0.1, "loss": 2.3675591945648193, "step": 3792 }, { "epoch": 0.12044444444444445, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.369569778442383, "step": 3794 }, { "epoch": 0.12050793650793651, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.3393490314483643, "step": 3796 }, { "epoch": 0.12057142857142857, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.3475990295410156, "step": 3798 }, { "epoch": 0.12063492063492064, "grad_norm": 0.3671875, "learning_rate": 0.1, "loss": 2.361419200897217, "step": 3800 }, { "epoch": 0.1206984126984127, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.3742730617523193, "step": 3802 }, { "epoch": 0.12076190476190476, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.360008955001831, "step": 3804 }, { "epoch": 0.12082539682539682, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.349346876144409, "step": 3806 }, { "epoch": 0.12088888888888889, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.355668783187866, "step": 3808 }, { "epoch": 0.12095238095238095, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.346492290496826, "step": 3810 }, { "epoch": 0.12101587301587302, "grad_norm": 0.240234375, "learning_rate": 0.1, "loss": 2.3524973392486572, "step": 3812 }, { "epoch": 0.12107936507936508, "grad_norm": 0.33984375, "learning_rate": 0.1, "loss": 2.309561014175415, "step": 3814 }, { "epoch": 0.12114285714285715, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.364302635192871, "step": 3816 }, { "epoch": 0.1212063492063492, "grad_norm": 0.0947265625, "learning_rate": 0.1, "loss": 2.3511335849761963, "step": 3818 }, { "epoch": 0.12126984126984126, "grad_norm": 0.0888671875, "learning_rate": 0.1, "loss": 2.350064992904663, "step": 3820 }, { "epoch": 0.12133333333333333, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.373480796813965, "step": 3822 }, { "epoch": 0.12139682539682539, "grad_norm": 0.2197265625, "learning_rate": 0.1, "loss": 2.379816770553589, "step": 3824 }, { "epoch": 0.12146031746031746, "grad_norm": 0.326171875, "learning_rate": 0.1, "loss": 2.3490331172943115, "step": 3826 }, { "epoch": 0.12152380952380952, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.3347930908203125, "step": 3828 }, { "epoch": 0.12158730158730159, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.367295742034912, "step": 3830 }, { "epoch": 0.12165079365079365, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.346893548965454, "step": 3832 }, { "epoch": 0.12171428571428572, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.3683176040649414, "step": 3834 }, { "epoch": 0.12177777777777778, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.357957124710083, "step": 3836 }, { "epoch": 0.12184126984126985, "grad_norm": 0.318359375, "learning_rate": 0.1, "loss": 2.338038444519043, "step": 3838 }, { "epoch": 0.1219047619047619, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.3282175064086914, "step": 3840 }, { "epoch": 0.12196825396825396, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.3441269397735596, "step": 3842 }, { "epoch": 0.12203174603174603, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.3521230220794678, "step": 3844 }, { "epoch": 0.12209523809523809, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.3710358142852783, "step": 3846 }, { "epoch": 0.12215873015873016, "grad_norm": 0.3359375, "learning_rate": 0.1, "loss": 2.3646812438964844, "step": 3848 }, { "epoch": 0.12222222222222222, "grad_norm": 0.23046875, "learning_rate": 0.1, "loss": 2.3515796661376953, "step": 3850 }, { "epoch": 0.12228571428571429, "grad_norm": 0.06689453125, "learning_rate": 0.1, "loss": 2.361161231994629, "step": 3852 }, { "epoch": 0.12234920634920635, "grad_norm": 0.0634765625, "learning_rate": 0.1, "loss": 2.3432798385620117, "step": 3854 }, { "epoch": 0.12241269841269842, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.350728750228882, "step": 3856 }, { "epoch": 0.12247619047619047, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.3538618087768555, "step": 3858 }, { "epoch": 0.12253968253968255, "grad_norm": 0.208984375, "learning_rate": 0.1, "loss": 2.3678226470947266, "step": 3860 }, { "epoch": 0.1226031746031746, "grad_norm": 0.2353515625, "learning_rate": 0.1, "loss": 2.3657522201538086, "step": 3862 }, { "epoch": 0.12266666666666666, "grad_norm": 0.2373046875, "learning_rate": 0.1, "loss": 2.3352084159851074, "step": 3864 }, { "epoch": 0.12273015873015873, "grad_norm": 0.224609375, "learning_rate": 0.1, "loss": 2.350445032119751, "step": 3866 }, { "epoch": 0.12279365079365079, "grad_norm": 0.07275390625, "learning_rate": 0.1, "loss": 2.337188720703125, "step": 3868 }, { "epoch": 0.12285714285714286, "grad_norm": 0.22265625, "learning_rate": 0.1, "loss": 2.3634109497070312, "step": 3870 }, { "epoch": 0.12292063492063492, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.344081401824951, "step": 3872 }, { "epoch": 0.12298412698412699, "grad_norm": 0.41015625, "learning_rate": 0.1, "loss": 2.3295137882232666, "step": 3874 }, { "epoch": 0.12304761904761904, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.3418519496917725, "step": 3876 }, { "epoch": 0.12311111111111112, "grad_norm": 0.1376953125, "learning_rate": 0.1, "loss": 2.3524341583251953, "step": 3878 }, { "epoch": 0.12317460317460317, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.3485474586486816, "step": 3880 }, { "epoch": 0.12323809523809524, "grad_norm": 0.203125, "learning_rate": 0.1, "loss": 2.3329694271087646, "step": 3882 }, { "epoch": 0.1233015873015873, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.340209484100342, "step": 3884 }, { "epoch": 0.12336507936507937, "grad_norm": 0.115234375, "learning_rate": 0.1, "loss": 2.3338372707366943, "step": 3886 }, { "epoch": 0.12342857142857143, "grad_norm": 0.060546875, "learning_rate": 0.1, "loss": 2.3766210079193115, "step": 3888 }, { "epoch": 0.12349206349206349, "grad_norm": 0.10791015625, "learning_rate": 0.1, "loss": 2.334059000015259, "step": 3890 }, { "epoch": 0.12355555555555556, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.36077618598938, "step": 3892 }, { "epoch": 0.12361904761904761, "grad_norm": 0.3828125, "learning_rate": 0.1, "loss": 2.331653356552124, "step": 3894 }, { "epoch": 0.12368253968253969, "grad_norm": 0.296875, "learning_rate": 0.1, "loss": 2.3405730724334717, "step": 3896 }, { "epoch": 0.12374603174603174, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.3480072021484375, "step": 3898 }, { "epoch": 0.12380952380952381, "grad_norm": 0.080078125, "learning_rate": 0.1, "loss": 2.3456244468688965, "step": 3900 }, { "epoch": 0.12387301587301587, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.3401095867156982, "step": 3902 }, { "epoch": 0.12393650793650794, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.2982754707336426, "step": 3904 }, { "epoch": 0.124, "grad_norm": 0.06787109375, "learning_rate": 0.1, "loss": 2.372802495956421, "step": 3906 }, { "epoch": 0.12406349206349207, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.3170087337493896, "step": 3908 }, { "epoch": 0.12412698412698413, "grad_norm": 0.2412109375, "learning_rate": 0.1, "loss": 2.314115285873413, "step": 3910 }, { "epoch": 0.12419047619047618, "grad_norm": 0.2294921875, "learning_rate": 0.1, "loss": 2.3621058464050293, "step": 3912 }, { "epoch": 0.12425396825396826, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.324068784713745, "step": 3914 }, { "epoch": 0.12431746031746031, "grad_norm": 0.42578125, "learning_rate": 0.1, "loss": 2.33699107170105, "step": 3916 }, { "epoch": 0.12438095238095238, "grad_norm": 0.248046875, "learning_rate": 0.1, "loss": 2.3425543308258057, "step": 3918 }, { "epoch": 0.12444444444444444, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.353814125061035, "step": 3920 }, { "epoch": 0.12450793650793651, "grad_norm": 0.08837890625, "learning_rate": 0.1, "loss": 2.3523106575012207, "step": 3922 }, { "epoch": 0.12457142857142857, "grad_norm": 0.06591796875, "learning_rate": 0.1, "loss": 2.336780548095703, "step": 3924 }, { "epoch": 0.12463492063492064, "grad_norm": 0.11181640625, "learning_rate": 0.1, "loss": 2.3120875358581543, "step": 3926 }, { "epoch": 0.1246984126984127, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.323986768722534, "step": 3928 }, { "epoch": 0.12476190476190477, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.3211686611175537, "step": 3930 }, { "epoch": 0.12482539682539683, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.3144755363464355, "step": 3932 }, { "epoch": 0.12488888888888888, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.322165012359619, "step": 3934 }, { "epoch": 0.12495238095238095, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.326829433441162, "step": 3936 }, { "epoch": 0.12501587301587302, "grad_norm": 0.0849609375, "learning_rate": 0.1, "loss": 2.3243355751037598, "step": 3938 }, { "epoch": 0.12507936507936507, "grad_norm": 0.0966796875, "learning_rate": 0.1, "loss": 2.332132577896118, "step": 3940 }, { "epoch": 0.12514285714285714, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.311143636703491, "step": 3942 }, { "epoch": 0.1252063492063492, "grad_norm": 0.09423828125, "learning_rate": 0.1, "loss": 2.3078866004943848, "step": 3944 }, { "epoch": 0.12526984126984128, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.3233587741851807, "step": 3946 }, { "epoch": 0.12533333333333332, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.3183767795562744, "step": 3948 }, { "epoch": 0.1253968253968254, "grad_norm": 0.232421875, "learning_rate": 0.1, "loss": 2.308122396469116, "step": 3950 }, { "epoch": 0.12546031746031747, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.3313615322113037, "step": 3952 }, { "epoch": 0.12552380952380954, "grad_norm": 0.0703125, "learning_rate": 0.1, "loss": 2.309504508972168, "step": 3954 }, { "epoch": 0.12558730158730158, "grad_norm": 0.1416015625, "learning_rate": 0.1, "loss": 2.278714656829834, "step": 3956 }, { "epoch": 0.12565079365079365, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.329615354537964, "step": 3958 }, { "epoch": 0.12571428571428572, "grad_norm": 0.216796875, "learning_rate": 0.1, "loss": 2.3614537715911865, "step": 3960 }, { "epoch": 0.12577777777777777, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.3237953186035156, "step": 3962 }, { "epoch": 0.12584126984126984, "grad_norm": 0.390625, "learning_rate": 0.1, "loss": 2.315549373626709, "step": 3964 }, { "epoch": 0.1259047619047619, "grad_norm": 0.302734375, "learning_rate": 0.1, "loss": 2.302813768386841, "step": 3966 }, { "epoch": 0.12596825396825398, "grad_norm": 0.07373046875, "learning_rate": 0.1, "loss": 2.329955577850342, "step": 3968 }, { "epoch": 0.12603174603174602, "grad_norm": 0.1181640625, "learning_rate": 0.1, "loss": 2.3176984786987305, "step": 3970 }, { "epoch": 0.1260952380952381, "grad_norm": 0.259765625, "learning_rate": 0.1, "loss": 2.3290371894836426, "step": 3972 }, { "epoch": 0.12615873015873016, "grad_norm": 0.047607421875, "learning_rate": 0.1, "loss": 2.293030261993408, "step": 3974 }, { "epoch": 0.12622222222222224, "grad_norm": 0.07177734375, "learning_rate": 0.1, "loss": 2.3167073726654053, "step": 3976 }, { "epoch": 0.12628571428571428, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.320042133331299, "step": 3978 }, { "epoch": 0.12634920634920635, "grad_norm": 0.0703125, "learning_rate": 0.1, "loss": 2.3326938152313232, "step": 3980 }, { "epoch": 0.12641269841269842, "grad_norm": 0.07861328125, "learning_rate": 0.1, "loss": 2.3096683025360107, "step": 3982 }, { "epoch": 0.12647619047619046, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.292703866958618, "step": 3984 }, { "epoch": 0.12653968253968254, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.2882487773895264, "step": 3986 }, { "epoch": 0.1266031746031746, "grad_norm": 0.28125, "learning_rate": 0.1, "loss": 2.310131072998047, "step": 3988 }, { "epoch": 0.12666666666666668, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.3456428050994873, "step": 3990 }, { "epoch": 0.12673015873015872, "grad_norm": 0.2177734375, "learning_rate": 0.1, "loss": 2.32218074798584, "step": 3992 }, { "epoch": 0.1267936507936508, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.3170299530029297, "step": 3994 }, { "epoch": 0.12685714285714286, "grad_norm": 0.259765625, "learning_rate": 0.1, "loss": 2.3159193992614746, "step": 3996 }, { "epoch": 0.12692063492063493, "grad_norm": 0.208984375, "learning_rate": 0.1, "loss": 2.2996907234191895, "step": 3998 }, { "epoch": 0.12698412698412698, "grad_norm": 0.0947265625, "learning_rate": 0.1, "loss": 2.2910571098327637, "step": 4000 }, { "epoch": 0.12704761904761905, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.2998952865600586, "step": 4002 }, { "epoch": 0.12711111111111112, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.2999749183654785, "step": 4004 }, { "epoch": 0.12717460317460316, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.293358087539673, "step": 4006 }, { "epoch": 0.12723809523809523, "grad_norm": 0.3671875, "learning_rate": 0.1, "loss": 2.309164524078369, "step": 4008 }, { "epoch": 0.1273015873015873, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.2952113151550293, "step": 4010 }, { "epoch": 0.12736507936507938, "grad_norm": 0.08544921875, "learning_rate": 0.1, "loss": 2.3047308921813965, "step": 4012 }, { "epoch": 0.12742857142857142, "grad_norm": 0.0595703125, "learning_rate": 0.1, "loss": 2.307676076889038, "step": 4014 }, { "epoch": 0.1274920634920635, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.3094213008880615, "step": 4016 }, { "epoch": 0.12755555555555556, "grad_norm": 0.1162109375, "learning_rate": 0.1, "loss": 2.293823480606079, "step": 4018 }, { "epoch": 0.12761904761904763, "grad_norm": 0.0927734375, "learning_rate": 0.1, "loss": 2.2778782844543457, "step": 4020 }, { "epoch": 0.12768253968253968, "grad_norm": 0.2392578125, "learning_rate": 0.1, "loss": 2.3053600788116455, "step": 4022 }, { "epoch": 0.12774603174603175, "grad_norm": 0.73046875, "learning_rate": 0.1, "loss": 2.313859462738037, "step": 4024 }, { "epoch": 0.12780952380952382, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.3130691051483154, "step": 4026 }, { "epoch": 0.12787301587301586, "grad_norm": 0.0869140625, "learning_rate": 0.1, "loss": 2.301145553588867, "step": 4028 }, { "epoch": 0.12793650793650793, "grad_norm": 0.07470703125, "learning_rate": 0.1, "loss": 2.289602756500244, "step": 4030 }, { "epoch": 0.128, "grad_norm": 0.055908203125, "learning_rate": 0.1, "loss": 2.3151021003723145, "step": 4032 }, { "epoch": 0.12806349206349207, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.318293571472168, "step": 4034 }, { "epoch": 0.12812698412698412, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.3241305351257324, "step": 4036 }, { "epoch": 0.1281904761904762, "grad_norm": 0.234375, "learning_rate": 0.1, "loss": 2.282428503036499, "step": 4038 }, { "epoch": 0.12825396825396826, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.2952351570129395, "step": 4040 }, { "epoch": 0.12831746031746033, "grad_norm": 0.07861328125, "learning_rate": 0.1, "loss": 2.3420283794403076, "step": 4042 }, { "epoch": 0.12838095238095237, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.2614071369171143, "step": 4044 }, { "epoch": 0.12844444444444444, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.2823596000671387, "step": 4046 }, { "epoch": 0.12850793650793652, "grad_norm": 0.240234375, "learning_rate": 0.1, "loss": 2.320847749710083, "step": 4048 }, { "epoch": 0.12857142857142856, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.2951650619506836, "step": 4050 }, { "epoch": 0.12863492063492063, "grad_norm": 0.06005859375, "learning_rate": 0.1, "loss": 2.3040707111358643, "step": 4052 }, { "epoch": 0.1286984126984127, "grad_norm": 0.11376953125, "learning_rate": 0.1, "loss": 2.322395086288452, "step": 4054 }, { "epoch": 0.12876190476190477, "grad_norm": 0.10595703125, "learning_rate": 0.1, "loss": 2.312063694000244, "step": 4056 }, { "epoch": 0.12882539682539682, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.301162004470825, "step": 4058 }, { "epoch": 0.1288888888888889, "grad_norm": 0.1201171875, "learning_rate": 0.1, "loss": 2.307542562484741, "step": 4060 }, { "epoch": 0.12895238095238096, "grad_norm": 0.1328125, "learning_rate": 0.1, "loss": 2.3333139419555664, "step": 4062 }, { "epoch": 0.12901587301587303, "grad_norm": 0.3671875, "learning_rate": 0.1, "loss": 2.335141897201538, "step": 4064 }, { "epoch": 0.12907936507936507, "grad_norm": 0.0654296875, "learning_rate": 0.1, "loss": 2.308548927307129, "step": 4066 }, { "epoch": 0.12914285714285714, "grad_norm": 0.1171875, "learning_rate": 0.1, "loss": 2.310283660888672, "step": 4068 }, { "epoch": 0.1292063492063492, "grad_norm": 0.1123046875, "learning_rate": 0.1, "loss": 2.2916953563690186, "step": 4070 }, { "epoch": 0.12926984126984126, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.2935473918914795, "step": 4072 }, { "epoch": 0.12933333333333333, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.296895980834961, "step": 4074 }, { "epoch": 0.1293968253968254, "grad_norm": 0.11083984375, "learning_rate": 0.1, "loss": 2.31119441986084, "step": 4076 }, { "epoch": 0.12946031746031747, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.3089334964752197, "step": 4078 }, { "epoch": 0.1295238095238095, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.2776029109954834, "step": 4080 }, { "epoch": 0.12958730158730158, "grad_norm": 0.447265625, "learning_rate": 0.1, "loss": 2.2961392402648926, "step": 4082 }, { "epoch": 0.12965079365079366, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.3097925186157227, "step": 4084 }, { "epoch": 0.12971428571428573, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.3098373413085938, "step": 4086 }, { "epoch": 0.12977777777777777, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.2704014778137207, "step": 4088 }, { "epoch": 0.12984126984126984, "grad_norm": 0.400390625, "learning_rate": 0.1, "loss": 2.3183772563934326, "step": 4090 }, { "epoch": 0.1299047619047619, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.30275559425354, "step": 4092 }, { "epoch": 0.12996825396825396, "grad_norm": 0.06640625, "learning_rate": 0.1, "loss": 2.3035848140716553, "step": 4094 }, { "epoch": 0.13003174603174603, "grad_norm": 0.07666015625, "learning_rate": 0.1, "loss": 2.312878131866455, "step": 4096 }, { "epoch": 0.1300952380952381, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.2982823848724365, "step": 4098 }, { "epoch": 0.13015873015873017, "grad_norm": 0.12353515625, "learning_rate": 0.1, "loss": 2.3346946239471436, "step": 4100 }, { "epoch": 0.1302222222222222, "grad_norm": 0.07177734375, "learning_rate": 0.1, "loss": 2.300450563430786, "step": 4102 }, { "epoch": 0.13028571428571428, "grad_norm": 0.2236328125, "learning_rate": 0.1, "loss": 2.3050851821899414, "step": 4104 }, { "epoch": 0.13034920634920635, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.275207042694092, "step": 4106 }, { "epoch": 0.13041269841269842, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.3094964027404785, "step": 4108 }, { "epoch": 0.13047619047619047, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.3019485473632812, "step": 4110 }, { "epoch": 0.13053968253968254, "grad_norm": 0.078125, "learning_rate": 0.1, "loss": 2.2928504943847656, "step": 4112 }, { "epoch": 0.1306031746031746, "grad_norm": 0.05126953125, "learning_rate": 0.1, "loss": 2.2727720737457275, "step": 4114 }, { "epoch": 0.13066666666666665, "grad_norm": 0.052001953125, "learning_rate": 0.1, "loss": 2.314117670059204, "step": 4116 }, { "epoch": 0.13073015873015872, "grad_norm": 0.2470703125, "learning_rate": 0.1, "loss": 2.3207216262817383, "step": 4118 }, { "epoch": 0.1307936507936508, "grad_norm": 0.39453125, "learning_rate": 0.1, "loss": 2.285189151763916, "step": 4120 }, { "epoch": 0.13085714285714287, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.308685779571533, "step": 4122 }, { "epoch": 0.1309206349206349, "grad_norm": 0.216796875, "learning_rate": 0.1, "loss": 2.298062324523926, "step": 4124 }, { "epoch": 0.13098412698412698, "grad_norm": 0.244140625, "learning_rate": 0.1, "loss": 2.3307363986968994, "step": 4126 }, { "epoch": 0.13104761904761905, "grad_norm": 0.08203125, "learning_rate": 0.1, "loss": 2.325293779373169, "step": 4128 }, { "epoch": 0.13111111111111112, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.3287625312805176, "step": 4130 }, { "epoch": 0.13117460317460317, "grad_norm": 0.21484375, "learning_rate": 0.1, "loss": 2.3046464920043945, "step": 4132 }, { "epoch": 0.13123809523809524, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.2857155799865723, "step": 4134 }, { "epoch": 0.1313015873015873, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.3150408267974854, "step": 4136 }, { "epoch": 0.13136507936507935, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.2846455574035645, "step": 4138 }, { "epoch": 0.13142857142857142, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.2941555976867676, "step": 4140 }, { "epoch": 0.1314920634920635, "grad_norm": 0.0732421875, "learning_rate": 0.1, "loss": 2.310887098312378, "step": 4142 }, { "epoch": 0.13155555555555556, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.302220582962036, "step": 4144 }, { "epoch": 0.1316190476190476, "grad_norm": 0.20703125, "learning_rate": 0.1, "loss": 2.3110568523406982, "step": 4146 }, { "epoch": 0.13168253968253968, "grad_norm": 0.1240234375, "learning_rate": 0.1, "loss": 2.3018226623535156, "step": 4148 }, { "epoch": 0.13174603174603175, "grad_norm": 0.35546875, "learning_rate": 0.1, "loss": 2.2848658561706543, "step": 4150 }, { "epoch": 0.13180952380952382, "grad_norm": 0.4921875, "learning_rate": 0.1, "loss": 2.3013007640838623, "step": 4152 }, { "epoch": 0.13187301587301586, "grad_norm": 0.1201171875, "learning_rate": 0.1, "loss": 2.3179543018341064, "step": 4154 }, { "epoch": 0.13193650793650794, "grad_norm": 0.07568359375, "learning_rate": 0.1, "loss": 2.294118642807007, "step": 4156 }, { "epoch": 0.132, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.284237861633301, "step": 4158 }, { "epoch": 0.13206349206349208, "grad_norm": 0.251953125, "learning_rate": 0.1, "loss": 2.2860934734344482, "step": 4160 }, { "epoch": 0.13212698412698412, "grad_norm": 0.333984375, "learning_rate": 0.1, "loss": 2.2900962829589844, "step": 4162 }, { "epoch": 0.1321904761904762, "grad_norm": 0.23828125, "learning_rate": 0.1, "loss": 2.2966268062591553, "step": 4164 }, { "epoch": 0.13225396825396826, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.303171157836914, "step": 4166 }, { "epoch": 0.1323174603174603, "grad_norm": 0.05322265625, "learning_rate": 0.1, "loss": 2.279351234436035, "step": 4168 }, { "epoch": 0.13238095238095238, "grad_norm": 0.12255859375, "learning_rate": 0.1, "loss": 2.280440092086792, "step": 4170 }, { "epoch": 0.13244444444444445, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.302543878555298, "step": 4172 }, { "epoch": 0.13250793650793652, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.2851407527923584, "step": 4174 }, { "epoch": 0.13257142857142856, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.288750171661377, "step": 4176 }, { "epoch": 0.13263492063492063, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.2929165363311768, "step": 4178 }, { "epoch": 0.1326984126984127, "grad_norm": 0.07421875, "learning_rate": 0.1, "loss": 2.2893130779266357, "step": 4180 }, { "epoch": 0.13276190476190478, "grad_norm": 0.049560546875, "learning_rate": 0.1, "loss": 2.2844560146331787, "step": 4182 }, { "epoch": 0.13282539682539682, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.295564889907837, "step": 4184 }, { "epoch": 0.1328888888888889, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.30666446685791, "step": 4186 }, { "epoch": 0.13295238095238096, "grad_norm": 0.2734375, "learning_rate": 0.1, "loss": 2.285140037536621, "step": 4188 }, { "epoch": 0.133015873015873, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.260340452194214, "step": 4190 }, { "epoch": 0.13307936507936508, "grad_norm": 0.051513671875, "learning_rate": 0.1, "loss": 2.273613452911377, "step": 4192 }, { "epoch": 0.13314285714285715, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.2992799282073975, "step": 4194 }, { "epoch": 0.13320634920634922, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.3055553436279297, "step": 4196 }, { "epoch": 0.13326984126984126, "grad_norm": 0.462890625, "learning_rate": 0.1, "loss": 2.2987685203552246, "step": 4198 }, { "epoch": 0.13333333333333333, "grad_norm": 0.10400390625, "learning_rate": 0.1, "loss": 2.3145604133605957, "step": 4200 }, { "epoch": 0.1333968253968254, "grad_norm": 0.061279296875, "learning_rate": 0.1, "loss": 2.281442880630493, "step": 4202 }, { "epoch": 0.13346031746031747, "grad_norm": 0.0849609375, "learning_rate": 0.1, "loss": 2.293602705001831, "step": 4204 }, { "epoch": 0.13352380952380952, "grad_norm": 0.0888671875, "learning_rate": 0.1, "loss": 2.286525249481201, "step": 4206 }, { "epoch": 0.1335873015873016, "grad_norm": 0.06982421875, "learning_rate": 0.1, "loss": 2.2906503677368164, "step": 4208 }, { "epoch": 0.13365079365079366, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.292182207107544, "step": 4210 }, { "epoch": 0.1337142857142857, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.3163421154022217, "step": 4212 }, { "epoch": 0.13377777777777777, "grad_norm": 0.31640625, "learning_rate": 0.1, "loss": 2.2938454151153564, "step": 4214 }, { "epoch": 0.13384126984126984, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.2806191444396973, "step": 4216 }, { "epoch": 0.13390476190476192, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.2990617752075195, "step": 4218 }, { "epoch": 0.13396825396825396, "grad_norm": 0.306640625, "learning_rate": 0.1, "loss": 2.299926280975342, "step": 4220 }, { "epoch": 0.13403174603174603, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.3058290481567383, "step": 4222 }, { "epoch": 0.1340952380952381, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.27773380279541, "step": 4224 }, { "epoch": 0.13415873015873017, "grad_norm": 0.06591796875, "learning_rate": 0.1, "loss": 2.2853586673736572, "step": 4226 }, { "epoch": 0.13422222222222221, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.2966268062591553, "step": 4228 }, { "epoch": 0.13428571428571429, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.3038277626037598, "step": 4230 }, { "epoch": 0.13434920634920636, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.2866132259368896, "step": 4232 }, { "epoch": 0.1344126984126984, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.2692346572875977, "step": 4234 }, { "epoch": 0.13447619047619047, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.3025224208831787, "step": 4236 }, { "epoch": 0.13453968253968254, "grad_norm": 0.232421875, "learning_rate": 0.1, "loss": 2.293996572494507, "step": 4238 }, { "epoch": 0.1346031746031746, "grad_norm": 0.35546875, "learning_rate": 0.1, "loss": 2.293945789337158, "step": 4240 }, { "epoch": 0.13466666666666666, "grad_norm": 0.12255859375, "learning_rate": 0.1, "loss": 2.2836687564849854, "step": 4242 }, { "epoch": 0.13473015873015873, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.2954463958740234, "step": 4244 }, { "epoch": 0.1347936507936508, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.28654408454895, "step": 4246 }, { "epoch": 0.13485714285714287, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.2671289443969727, "step": 4248 }, { "epoch": 0.1349206349206349, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.2932395935058594, "step": 4250 }, { "epoch": 0.13498412698412698, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.2669942378997803, "step": 4252 }, { "epoch": 0.13504761904761906, "grad_norm": 0.08935546875, "learning_rate": 0.1, "loss": 2.285125494003296, "step": 4254 }, { "epoch": 0.1351111111111111, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.2941782474517822, "step": 4256 }, { "epoch": 0.13517460317460317, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.2807202339172363, "step": 4258 }, { "epoch": 0.13523809523809524, "grad_norm": 0.08935546875, "learning_rate": 0.1, "loss": 2.279536247253418, "step": 4260 }, { "epoch": 0.1353015873015873, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.2860279083251953, "step": 4262 }, { "epoch": 0.13536507936507935, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.3012378215789795, "step": 4264 }, { "epoch": 0.13542857142857143, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.2941019535064697, "step": 4266 }, { "epoch": 0.1354920634920635, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.2912137508392334, "step": 4268 }, { "epoch": 0.13555555555555557, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.287700653076172, "step": 4270 }, { "epoch": 0.1356190476190476, "grad_norm": 0.0986328125, "learning_rate": 0.1, "loss": 2.308756113052368, "step": 4272 }, { "epoch": 0.13568253968253968, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.306694507598877, "step": 4274 }, { "epoch": 0.13574603174603175, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.2836365699768066, "step": 4276 }, { "epoch": 0.1358095238095238, "grad_norm": 0.2060546875, "learning_rate": 0.1, "loss": 2.280918836593628, "step": 4278 }, { "epoch": 0.13587301587301587, "grad_norm": 0.359375, "learning_rate": 0.1, "loss": 2.276252031326294, "step": 4280 }, { "epoch": 0.13593650793650794, "grad_norm": 0.37109375, "learning_rate": 0.1, "loss": 2.2948453426361084, "step": 4282 }, { "epoch": 0.136, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.2844502925872803, "step": 4284 }, { "epoch": 0.13606349206349205, "grad_norm": 0.208984375, "learning_rate": 0.1, "loss": 2.2895216941833496, "step": 4286 }, { "epoch": 0.13612698412698412, "grad_norm": 0.1181640625, "learning_rate": 0.1, "loss": 2.302628517150879, "step": 4288 }, { "epoch": 0.1361904761904762, "grad_norm": 0.06787109375, "learning_rate": 0.1, "loss": 2.259981632232666, "step": 4290 }, { "epoch": 0.13625396825396827, "grad_norm": 0.06201171875, "learning_rate": 0.1, "loss": 2.2828409671783447, "step": 4292 }, { "epoch": 0.1363174603174603, "grad_norm": 0.08056640625, "learning_rate": 0.1, "loss": 2.2846052646636963, "step": 4294 }, { "epoch": 0.13638095238095238, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.2817633152008057, "step": 4296 }, { "epoch": 0.13644444444444445, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.277045249938965, "step": 4298 }, { "epoch": 0.1365079365079365, "grad_norm": 0.1953125, "learning_rate": 0.1, "loss": 2.25596022605896, "step": 4300 }, { "epoch": 0.13657142857142857, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.291978597640991, "step": 4302 }, { "epoch": 0.13663492063492064, "grad_norm": 0.306640625, "learning_rate": 0.1, "loss": 2.3036794662475586, "step": 4304 }, { "epoch": 0.1366984126984127, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.266505479812622, "step": 4306 }, { "epoch": 0.13676190476190475, "grad_norm": 0.06689453125, "learning_rate": 0.1, "loss": 2.2860865592956543, "step": 4308 }, { "epoch": 0.13682539682539682, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.3280622959136963, "step": 4310 }, { "epoch": 0.1368888888888889, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.2800838947296143, "step": 4312 }, { "epoch": 0.13695238095238096, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.299053907394409, "step": 4314 }, { "epoch": 0.137015873015873, "grad_norm": 0.2734375, "learning_rate": 0.1, "loss": 2.294050931930542, "step": 4316 }, { "epoch": 0.13707936507936508, "grad_norm": 0.296875, "learning_rate": 0.1, "loss": 2.276416778564453, "step": 4318 }, { "epoch": 0.13714285714285715, "grad_norm": 0.1416015625, "learning_rate": 0.1, "loss": 2.2984399795532227, "step": 4320 }, { "epoch": 0.1372063492063492, "grad_norm": 0.2216796875, "learning_rate": 0.1, "loss": 2.2959301471710205, "step": 4322 }, { "epoch": 0.13726984126984126, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.303802490234375, "step": 4324 }, { "epoch": 0.13733333333333334, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.2673087120056152, "step": 4326 }, { "epoch": 0.1373968253968254, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.3030173778533936, "step": 4328 }, { "epoch": 0.13746031746031745, "grad_norm": 0.23046875, "learning_rate": 0.1, "loss": 2.274554967880249, "step": 4330 }, { "epoch": 0.13752380952380952, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.2638657093048096, "step": 4332 }, { "epoch": 0.1375873015873016, "grad_norm": 0.08984375, "learning_rate": 0.1, "loss": 2.316516160964966, "step": 4334 }, { "epoch": 0.13765079365079366, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.2553367614746094, "step": 4336 }, { "epoch": 0.1377142857142857, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.3091976642608643, "step": 4338 }, { "epoch": 0.13777777777777778, "grad_norm": 0.298828125, "learning_rate": 0.1, "loss": 2.286018133163452, "step": 4340 }, { "epoch": 0.13784126984126985, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.2955942153930664, "step": 4342 }, { "epoch": 0.1379047619047619, "grad_norm": 0.08642578125, "learning_rate": 0.1, "loss": 2.302588939666748, "step": 4344 }, { "epoch": 0.13796825396825396, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.2750494480133057, "step": 4346 }, { "epoch": 0.13803174603174603, "grad_norm": 0.30859375, "learning_rate": 0.1, "loss": 2.2970199584960938, "step": 4348 }, { "epoch": 0.1380952380952381, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.2827394008636475, "step": 4350 }, { "epoch": 0.13815873015873015, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.285839557647705, "step": 4352 }, { "epoch": 0.13822222222222222, "grad_norm": 0.0986328125, "learning_rate": 0.1, "loss": 2.285407781600952, "step": 4354 }, { "epoch": 0.1382857142857143, "grad_norm": 0.10009765625, "learning_rate": 0.1, "loss": 2.3056609630584717, "step": 4356 }, { "epoch": 0.13834920634920636, "grad_norm": 0.058837890625, "learning_rate": 0.1, "loss": 2.3059418201446533, "step": 4358 }, { "epoch": 0.1384126984126984, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.295012950897217, "step": 4360 }, { "epoch": 0.13847619047619047, "grad_norm": 0.478515625, "learning_rate": 0.1, "loss": 2.2789039611816406, "step": 4362 }, { "epoch": 0.13853968253968255, "grad_norm": 0.4140625, "learning_rate": 0.1, "loss": 2.2974021434783936, "step": 4364 }, { "epoch": 0.13860317460317462, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.2909114360809326, "step": 4366 }, { "epoch": 0.13866666666666666, "grad_norm": 0.06005859375, "learning_rate": 0.1, "loss": 2.2853970527648926, "step": 4368 }, { "epoch": 0.13873015873015873, "grad_norm": 0.07861328125, "learning_rate": 0.1, "loss": 2.300413131713867, "step": 4370 }, { "epoch": 0.1387936507936508, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.285550594329834, "step": 4372 }, { "epoch": 0.13885714285714285, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.2886886596679688, "step": 4374 }, { "epoch": 0.13892063492063492, "grad_norm": 0.11083984375, "learning_rate": 0.1, "loss": 2.2868621349334717, "step": 4376 }, { "epoch": 0.138984126984127, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.2672979831695557, "step": 4378 }, { "epoch": 0.13904761904761906, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.286505937576294, "step": 4380 }, { "epoch": 0.1391111111111111, "grad_norm": 0.0810546875, "learning_rate": 0.1, "loss": 2.2528669834136963, "step": 4382 }, { "epoch": 0.13917460317460317, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.2721211910247803, "step": 4384 }, { "epoch": 0.13923809523809524, "grad_norm": 0.1015625, "learning_rate": 0.1, "loss": 2.2832610607147217, "step": 4386 }, { "epoch": 0.13930158730158732, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.269446611404419, "step": 4388 }, { "epoch": 0.13936507936507936, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.271932363510132, "step": 4390 }, { "epoch": 0.13942857142857143, "grad_norm": 0.11767578125, "learning_rate": 0.1, "loss": 2.2948100566864014, "step": 4392 }, { "epoch": 0.1394920634920635, "grad_norm": 0.0517578125, "learning_rate": 0.1, "loss": 2.2993826866149902, "step": 4394 }, { "epoch": 0.13955555555555554, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.25460147857666, "step": 4396 }, { "epoch": 0.13961904761904761, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.292811632156372, "step": 4398 }, { "epoch": 0.13968253968253969, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.310152530670166, "step": 4400 }, { "epoch": 0.13974603174603176, "grad_norm": 0.2470703125, "learning_rate": 0.1, "loss": 2.2997281551361084, "step": 4402 }, { "epoch": 0.1398095238095238, "grad_norm": 0.06396484375, "learning_rate": 0.1, "loss": 2.2885093688964844, "step": 4404 }, { "epoch": 0.13987301587301587, "grad_norm": 0.06298828125, "learning_rate": 0.1, "loss": 2.274951696395874, "step": 4406 }, { "epoch": 0.13993650793650794, "grad_norm": 0.060302734375, "learning_rate": 0.1, "loss": 2.287485122680664, "step": 4408 }, { "epoch": 0.14, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.2850334644317627, "step": 4410 }, { "epoch": 0.14006349206349206, "grad_norm": 0.58203125, "learning_rate": 0.1, "loss": 2.296558380126953, "step": 4412 }, { "epoch": 0.14012698412698413, "grad_norm": 0.306640625, "learning_rate": 0.1, "loss": 2.274782657623291, "step": 4414 }, { "epoch": 0.1401904761904762, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.2750258445739746, "step": 4416 }, { "epoch": 0.14025396825396824, "grad_norm": 0.298828125, "learning_rate": 0.1, "loss": 2.3006911277770996, "step": 4418 }, { "epoch": 0.1403174603174603, "grad_norm": 0.2236328125, "learning_rate": 0.1, "loss": 2.3234779834747314, "step": 4420 }, { "epoch": 0.14038095238095238, "grad_norm": 0.2236328125, "learning_rate": 0.1, "loss": 2.2883758544921875, "step": 4422 }, { "epoch": 0.14044444444444446, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.291433095932007, "step": 4424 }, { "epoch": 0.1405079365079365, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.264465808868408, "step": 4426 }, { "epoch": 0.14057142857142857, "grad_norm": 0.07421875, "learning_rate": 0.1, "loss": 2.2783806324005127, "step": 4428 }, { "epoch": 0.14063492063492064, "grad_norm": 0.1103515625, "learning_rate": 0.1, "loss": 2.2846689224243164, "step": 4430 }, { "epoch": 0.1406984126984127, "grad_norm": 0.095703125, "learning_rate": 0.1, "loss": 2.2691848278045654, "step": 4432 }, { "epoch": 0.14076190476190475, "grad_norm": 0.06298828125, "learning_rate": 0.1, "loss": 2.2851531505584717, "step": 4434 }, { "epoch": 0.14082539682539683, "grad_norm": 0.09375, "learning_rate": 0.1, "loss": 2.282208204269409, "step": 4436 }, { "epoch": 0.1408888888888889, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.2620723247528076, "step": 4438 }, { "epoch": 0.14095238095238094, "grad_norm": 0.25, "learning_rate": 0.1, "loss": 2.303618907928467, "step": 4440 }, { "epoch": 0.141015873015873, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 2.283712863922119, "step": 4442 }, { "epoch": 0.14107936507936508, "grad_norm": 0.04150390625, "learning_rate": 0.1, "loss": 2.261486530303955, "step": 4444 }, { "epoch": 0.14114285714285715, "grad_norm": 0.259765625, "learning_rate": 0.1, "loss": 2.2939646244049072, "step": 4446 }, { "epoch": 0.1412063492063492, "grad_norm": 0.2373046875, "learning_rate": 0.1, "loss": 2.2825872898101807, "step": 4448 }, { "epoch": 0.14126984126984127, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.2820680141448975, "step": 4450 }, { "epoch": 0.14133333333333334, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.258233070373535, "step": 4452 }, { "epoch": 0.1413968253968254, "grad_norm": 0.1416015625, "learning_rate": 0.1, "loss": 2.2957892417907715, "step": 4454 }, { "epoch": 0.14146031746031745, "grad_norm": 0.12353515625, "learning_rate": 0.1, "loss": 2.2510557174682617, "step": 4456 }, { "epoch": 0.14152380952380952, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.2840123176574707, "step": 4458 }, { "epoch": 0.1415873015873016, "grad_norm": 0.07763671875, "learning_rate": 0.1, "loss": 2.2832515239715576, "step": 4460 }, { "epoch": 0.14165079365079364, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.263439655303955, "step": 4462 }, { "epoch": 0.1417142857142857, "grad_norm": 0.35546875, "learning_rate": 0.1, "loss": 2.244683265686035, "step": 4464 }, { "epoch": 0.14177777777777778, "grad_norm": 0.055419921875, "learning_rate": 0.1, "loss": 2.289228916168213, "step": 4466 }, { "epoch": 0.14184126984126985, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.2711644172668457, "step": 4468 }, { "epoch": 0.1419047619047619, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.2735097408294678, "step": 4470 }, { "epoch": 0.14196825396825397, "grad_norm": 0.310546875, "learning_rate": 0.1, "loss": 2.2618825435638428, "step": 4472 }, { "epoch": 0.14203174603174604, "grad_norm": 0.0869140625, "learning_rate": 0.1, "loss": 2.261165142059326, "step": 4474 }, { "epoch": 0.1420952380952381, "grad_norm": 0.23046875, "learning_rate": 0.1, "loss": 2.278230667114258, "step": 4476 }, { "epoch": 0.14215873015873015, "grad_norm": 0.298828125, "learning_rate": 0.1, "loss": 2.2467966079711914, "step": 4478 }, { "epoch": 0.14222222222222222, "grad_norm": 0.2421875, "learning_rate": 0.1, "loss": 2.227424144744873, "step": 4480 }, { "epoch": 0.1422857142857143, "grad_norm": 0.11328125, "learning_rate": 0.1, "loss": 2.2595136165618896, "step": 4482 }, { "epoch": 0.14234920634920634, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.2476394176483154, "step": 4484 }, { "epoch": 0.1424126984126984, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.2523202896118164, "step": 4486 }, { "epoch": 0.14247619047619048, "grad_norm": 0.1376953125, "learning_rate": 0.1, "loss": 2.285255193710327, "step": 4488 }, { "epoch": 0.14253968253968255, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.263206958770752, "step": 4490 }, { "epoch": 0.1426031746031746, "grad_norm": 0.1025390625, "learning_rate": 0.1, "loss": 2.2693092823028564, "step": 4492 }, { "epoch": 0.14266666666666666, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.2731518745422363, "step": 4494 }, { "epoch": 0.14273015873015873, "grad_norm": 0.0654296875, "learning_rate": 0.1, "loss": 2.2457032203674316, "step": 4496 }, { "epoch": 0.1427936507936508, "grad_norm": 0.11083984375, "learning_rate": 0.1, "loss": 2.276195764541626, "step": 4498 }, { "epoch": 0.14285714285714285, "grad_norm": 0.302734375, "learning_rate": 0.1, "loss": 2.247270345687866, "step": 4500 }, { "epoch": 0.14292063492063492, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.261371612548828, "step": 4502 }, { "epoch": 0.142984126984127, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.2719802856445312, "step": 4504 }, { "epoch": 0.14304761904761903, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.2561147212982178, "step": 4506 }, { "epoch": 0.1431111111111111, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.2542550563812256, "step": 4508 }, { "epoch": 0.14317460317460318, "grad_norm": 0.33984375, "learning_rate": 0.1, "loss": 2.2963666915893555, "step": 4510 }, { "epoch": 0.14323809523809525, "grad_norm": 0.34375, "learning_rate": 0.1, "loss": 2.2685673236846924, "step": 4512 }, { "epoch": 0.1433015873015873, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.2477807998657227, "step": 4514 }, { "epoch": 0.14336507936507936, "grad_norm": 0.0810546875, "learning_rate": 0.1, "loss": 2.2424492835998535, "step": 4516 }, { "epoch": 0.14342857142857143, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.228090763092041, "step": 4518 }, { "epoch": 0.1434920634920635, "grad_norm": 0.11083984375, "learning_rate": 0.1, "loss": 2.242058515548706, "step": 4520 }, { "epoch": 0.14355555555555555, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.2435553073883057, "step": 4522 }, { "epoch": 0.14361904761904762, "grad_norm": 0.11279296875, "learning_rate": 0.1, "loss": 2.248300075531006, "step": 4524 }, { "epoch": 0.1436825396825397, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.236762285232544, "step": 4526 }, { "epoch": 0.14374603174603173, "grad_norm": 0.05908203125, "learning_rate": 0.1, "loss": 2.2230749130249023, "step": 4528 }, { "epoch": 0.1438095238095238, "grad_norm": 0.0947265625, "learning_rate": 0.1, "loss": 2.235912561416626, "step": 4530 }, { "epoch": 0.14387301587301587, "grad_norm": 0.07373046875, "learning_rate": 0.1, "loss": 2.2367069721221924, "step": 4532 }, { "epoch": 0.14393650793650795, "grad_norm": 0.056396484375, "learning_rate": 0.1, "loss": 2.2447152137756348, "step": 4534 }, { "epoch": 0.144, "grad_norm": 0.04443359375, "learning_rate": 0.1, "loss": 2.2484188079833984, "step": 4536 }, { "epoch": 0.14406349206349206, "grad_norm": 0.2412109375, "learning_rate": 0.1, "loss": 2.2651255130767822, "step": 4538 }, { "epoch": 0.14412698412698413, "grad_norm": 0.21484375, "learning_rate": 0.1, "loss": 2.271829843521118, "step": 4540 }, { "epoch": 0.1441904761904762, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.258774518966675, "step": 4542 }, { "epoch": 0.14425396825396825, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.2609996795654297, "step": 4544 }, { "epoch": 0.14431746031746032, "grad_norm": 0.34375, "learning_rate": 0.1, "loss": 2.2434558868408203, "step": 4546 }, { "epoch": 0.1443809523809524, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.2460005283355713, "step": 4548 }, { "epoch": 0.14444444444444443, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.2790820598602295, "step": 4550 }, { "epoch": 0.1445079365079365, "grad_norm": 0.24609375, "learning_rate": 0.1, "loss": 2.2320797443389893, "step": 4552 }, { "epoch": 0.14457142857142857, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.266356945037842, "step": 4554 }, { "epoch": 0.14463492063492064, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.263770341873169, "step": 4556 }, { "epoch": 0.1446984126984127, "grad_norm": 0.08984375, "learning_rate": 0.1, "loss": 2.2677674293518066, "step": 4558 }, { "epoch": 0.14476190476190476, "grad_norm": 0.234375, "learning_rate": 0.1, "loss": 2.2430167198181152, "step": 4560 }, { "epoch": 0.14482539682539683, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.2568938732147217, "step": 4562 }, { "epoch": 0.1448888888888889, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.2287280559539795, "step": 4564 }, { "epoch": 0.14495238095238094, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.251957654953003, "step": 4566 }, { "epoch": 0.14501587301587301, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.244169235229492, "step": 4568 }, { "epoch": 0.14507936507936509, "grad_norm": 0.08349609375, "learning_rate": 0.1, "loss": 2.275089740753174, "step": 4570 }, { "epoch": 0.14514285714285713, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.2249627113342285, "step": 4572 }, { "epoch": 0.1452063492063492, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.2599003314971924, "step": 4574 }, { "epoch": 0.14526984126984127, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.2469427585601807, "step": 4576 }, { "epoch": 0.14533333333333334, "grad_norm": 0.2216796875, "learning_rate": 0.1, "loss": 2.2576394081115723, "step": 4578 }, { "epoch": 0.14539682539682539, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.259108543395996, "step": 4580 }, { "epoch": 0.14546031746031746, "grad_norm": 0.0556640625, "learning_rate": 0.1, "loss": 2.2728044986724854, "step": 4582 }, { "epoch": 0.14552380952380953, "grad_norm": 0.09423828125, "learning_rate": 0.1, "loss": 2.2337074279785156, "step": 4584 }, { "epoch": 0.1455873015873016, "grad_norm": 0.3203125, "learning_rate": 0.1, "loss": 2.2445552349090576, "step": 4586 }, { "epoch": 0.14565079365079364, "grad_norm": 0.462890625, "learning_rate": 0.1, "loss": 2.2480862140655518, "step": 4588 }, { "epoch": 0.1457142857142857, "grad_norm": 0.07177734375, "learning_rate": 0.1, "loss": 2.2571871280670166, "step": 4590 }, { "epoch": 0.14577777777777778, "grad_norm": 0.07080078125, "learning_rate": 0.1, "loss": 2.2532360553741455, "step": 4592 }, { "epoch": 0.14584126984126985, "grad_norm": 0.08984375, "learning_rate": 0.1, "loss": 2.2542994022369385, "step": 4594 }, { "epoch": 0.1459047619047619, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.2419047355651855, "step": 4596 }, { "epoch": 0.14596825396825397, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.227252721786499, "step": 4598 }, { "epoch": 0.14603174603174604, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.2516002655029297, "step": 4600 }, { "epoch": 0.14609523809523808, "grad_norm": 0.41796875, "learning_rate": 0.1, "loss": 2.239802360534668, "step": 4602 }, { "epoch": 0.14615873015873015, "grad_norm": 0.326171875, "learning_rate": 0.1, "loss": 2.2371981143951416, "step": 4604 }, { "epoch": 0.14622222222222223, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.229707956314087, "step": 4606 }, { "epoch": 0.1462857142857143, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.2383370399475098, "step": 4608 }, { "epoch": 0.14634920634920634, "grad_norm": 0.095703125, "learning_rate": 0.1, "loss": 2.2406628131866455, "step": 4610 }, { "epoch": 0.1464126984126984, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.254128932952881, "step": 4612 }, { "epoch": 0.14647619047619048, "grad_norm": 0.0712890625, "learning_rate": 0.1, "loss": 2.273343324661255, "step": 4614 }, { "epoch": 0.14653968253968255, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.2616019248962402, "step": 4616 }, { "epoch": 0.1466031746031746, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.2394556999206543, "step": 4618 }, { "epoch": 0.14666666666666667, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.2403886318206787, "step": 4620 }, { "epoch": 0.14673015873015874, "grad_norm": 0.068359375, "learning_rate": 0.1, "loss": 2.227113723754883, "step": 4622 }, { "epoch": 0.14679365079365078, "grad_norm": 0.0634765625, "learning_rate": 0.1, "loss": 2.2373292446136475, "step": 4624 }, { "epoch": 0.14685714285714285, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.2468066215515137, "step": 4626 }, { "epoch": 0.14692063492063492, "grad_norm": 0.11328125, "learning_rate": 0.1, "loss": 2.256910562515259, "step": 4628 }, { "epoch": 0.146984126984127, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.2626259326934814, "step": 4630 }, { "epoch": 0.14704761904761904, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.2317845821380615, "step": 4632 }, { "epoch": 0.1471111111111111, "grad_norm": 0.0673828125, "learning_rate": 0.1, "loss": 2.275815010070801, "step": 4634 }, { "epoch": 0.14717460317460318, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.2533082962036133, "step": 4636 }, { "epoch": 0.14723809523809525, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.227198362350464, "step": 4638 }, { "epoch": 0.1473015873015873, "grad_norm": 0.5078125, "learning_rate": 0.1, "loss": 2.2369890213012695, "step": 4640 }, { "epoch": 0.14736507936507937, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.2406513690948486, "step": 4642 }, { "epoch": 0.14742857142857144, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.257692813873291, "step": 4644 }, { "epoch": 0.14749206349206348, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.237513303756714, "step": 4646 }, { "epoch": 0.14755555555555555, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.2434279918670654, "step": 4648 }, { "epoch": 0.14761904761904762, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.2439849376678467, "step": 4650 }, { "epoch": 0.1476825396825397, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.2447140216827393, "step": 4652 }, { "epoch": 0.14774603174603174, "grad_norm": 0.050048828125, "learning_rate": 0.1, "loss": 2.25213623046875, "step": 4654 }, { "epoch": 0.1478095238095238, "grad_norm": 0.220703125, "learning_rate": 0.1, "loss": 2.2565934658050537, "step": 4656 }, { "epoch": 0.14787301587301588, "grad_norm": 0.33203125, "learning_rate": 0.1, "loss": 2.237602710723877, "step": 4658 }, { "epoch": 0.14793650793650795, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.253051996231079, "step": 4660 }, { "epoch": 0.148, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.2529561519622803, "step": 4662 }, { "epoch": 0.14806349206349206, "grad_norm": 0.11767578125, "learning_rate": 0.1, "loss": 2.24349045753479, "step": 4664 }, { "epoch": 0.14812698412698413, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.2448208332061768, "step": 4666 }, { "epoch": 0.14819047619047618, "grad_norm": 0.08935546875, "learning_rate": 0.1, "loss": 2.2609755992889404, "step": 4668 }, { "epoch": 0.14825396825396825, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.2459497451782227, "step": 4670 }, { "epoch": 0.14831746031746032, "grad_norm": 0.09423828125, "learning_rate": 0.1, "loss": 2.256282091140747, "step": 4672 }, { "epoch": 0.1483809523809524, "grad_norm": 0.25, "learning_rate": 0.1, "loss": 2.2702863216400146, "step": 4674 }, { "epoch": 0.14844444444444443, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.2412524223327637, "step": 4676 }, { "epoch": 0.1485079365079365, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.245201349258423, "step": 4678 }, { "epoch": 0.14857142857142858, "grad_norm": 0.2421875, "learning_rate": 0.1, "loss": 2.246119499206543, "step": 4680 }, { "epoch": 0.14863492063492065, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.2485485076904297, "step": 4682 }, { "epoch": 0.1486984126984127, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.2732627391815186, "step": 4684 }, { "epoch": 0.14876190476190476, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.246403455734253, "step": 4686 }, { "epoch": 0.14882539682539683, "grad_norm": 0.39453125, "learning_rate": 0.1, "loss": 2.265697717666626, "step": 4688 }, { "epoch": 0.14888888888888888, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.238236665725708, "step": 4690 }, { "epoch": 0.14895238095238095, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.2679128646850586, "step": 4692 }, { "epoch": 0.14901587301587302, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.2524406909942627, "step": 4694 }, { "epoch": 0.1490793650793651, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.237032651901245, "step": 4696 }, { "epoch": 0.14914285714285713, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.249223470687866, "step": 4698 }, { "epoch": 0.1492063492063492, "grad_norm": 0.216796875, "learning_rate": 0.1, "loss": 2.221526622772217, "step": 4700 }, { "epoch": 0.14926984126984127, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.270336151123047, "step": 4702 }, { "epoch": 0.14933333333333335, "grad_norm": 0.095703125, "learning_rate": 0.1, "loss": 2.2536163330078125, "step": 4704 }, { "epoch": 0.1493968253968254, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.239260673522949, "step": 4706 }, { "epoch": 0.14946031746031746, "grad_norm": 0.0712890625, "learning_rate": 0.1, "loss": 2.2750327587127686, "step": 4708 }, { "epoch": 0.14952380952380953, "grad_norm": 0.29296875, "learning_rate": 0.1, "loss": 2.2550790309906006, "step": 4710 }, { "epoch": 0.14958730158730157, "grad_norm": 0.419921875, "learning_rate": 0.1, "loss": 2.2591044902801514, "step": 4712 }, { "epoch": 0.14965079365079365, "grad_norm": 0.078125, "learning_rate": 0.1, "loss": 2.2412943840026855, "step": 4714 }, { "epoch": 0.14971428571428572, "grad_norm": 0.07666015625, "learning_rate": 0.1, "loss": 2.284665822982788, "step": 4716 }, { "epoch": 0.1497777777777778, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.2461774349212646, "step": 4718 }, { "epoch": 0.14984126984126983, "grad_norm": 0.08984375, "learning_rate": 0.1, "loss": 2.270545482635498, "step": 4720 }, { "epoch": 0.1499047619047619, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.279376268386841, "step": 4722 }, { "epoch": 0.14996825396825397, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.274411916732788, "step": 4724 }, { "epoch": 0.15003174603174604, "grad_norm": 0.2314453125, "learning_rate": 0.1, "loss": 2.252490758895874, "step": 4726 }, { "epoch": 0.1500952380952381, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.276149272918701, "step": 4728 }, { "epoch": 0.15015873015873016, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.2653772830963135, "step": 4730 }, { "epoch": 0.15022222222222223, "grad_norm": 0.26953125, "learning_rate": 0.1, "loss": 2.287546157836914, "step": 4732 }, { "epoch": 0.15028571428571427, "grad_norm": 0.240234375, "learning_rate": 0.1, "loss": 2.290461301803589, "step": 4734 }, { "epoch": 0.15034920634920634, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.233339548110962, "step": 4736 }, { "epoch": 0.15041269841269841, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.282514810562134, "step": 4738 }, { "epoch": 0.15047619047619049, "grad_norm": 0.052001953125, "learning_rate": 0.1, "loss": 2.2730605602264404, "step": 4740 }, { "epoch": 0.15053968253968253, "grad_norm": 0.05859375, "learning_rate": 0.1, "loss": 2.283360481262207, "step": 4742 }, { "epoch": 0.1506031746031746, "grad_norm": 0.20703125, "learning_rate": 0.1, "loss": 2.2638187408447266, "step": 4744 }, { "epoch": 0.15066666666666667, "grad_norm": 0.08544921875, "learning_rate": 0.1, "loss": 2.2753374576568604, "step": 4746 }, { "epoch": 0.15073015873015874, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.302623987197876, "step": 4748 }, { "epoch": 0.15079365079365079, "grad_norm": 0.29296875, "learning_rate": 0.1, "loss": 2.2814204692840576, "step": 4750 }, { "epoch": 0.15085714285714286, "grad_norm": 0.1650390625, "learning_rate": 0.1, "loss": 2.2422194480895996, "step": 4752 }, { "epoch": 0.15092063492063493, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.2660624980926514, "step": 4754 }, { "epoch": 0.15098412698412697, "grad_norm": 0.06005859375, "learning_rate": 0.1, "loss": 2.296535015106201, "step": 4756 }, { "epoch": 0.15104761904761904, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.2562804222106934, "step": 4758 }, { "epoch": 0.1511111111111111, "grad_norm": 0.1201171875, "learning_rate": 0.1, "loss": 2.274951934814453, "step": 4760 }, { "epoch": 0.15117460317460318, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.2625999450683594, "step": 4762 }, { "epoch": 0.15123809523809523, "grad_norm": 0.11376953125, "learning_rate": 0.1, "loss": 2.268707752227783, "step": 4764 }, { "epoch": 0.1513015873015873, "grad_norm": 0.498046875, "learning_rate": 0.1, "loss": 2.264815092086792, "step": 4766 }, { "epoch": 0.15136507936507937, "grad_norm": 0.4453125, "learning_rate": 0.1, "loss": 2.246915578842163, "step": 4768 }, { "epoch": 0.15142857142857144, "grad_norm": 0.12353515625, "learning_rate": 0.1, "loss": 2.2630462646484375, "step": 4770 }, { "epoch": 0.15149206349206348, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.2530624866485596, "step": 4772 }, { "epoch": 0.15155555555555555, "grad_norm": 0.047607421875, "learning_rate": 0.1, "loss": 2.242330551147461, "step": 4774 }, { "epoch": 0.15161904761904763, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.2545742988586426, "step": 4776 }, { "epoch": 0.15168253968253967, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.26286244392395, "step": 4778 }, { "epoch": 0.15174603174603174, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.259678840637207, "step": 4780 }, { "epoch": 0.1518095238095238, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.267683744430542, "step": 4782 }, { "epoch": 0.15187301587301588, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.253514528274536, "step": 4784 }, { "epoch": 0.15193650793650793, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.270596742630005, "step": 4786 }, { "epoch": 0.152, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.2687604427337646, "step": 4788 }, { "epoch": 0.15206349206349207, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.231996774673462, "step": 4790 }, { "epoch": 0.15212698412698414, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.2592649459838867, "step": 4792 }, { "epoch": 0.15219047619047618, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.2621262073516846, "step": 4794 }, { "epoch": 0.15225396825396825, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.22605037689209, "step": 4796 }, { "epoch": 0.15231746031746032, "grad_norm": 0.10400390625, "learning_rate": 0.1, "loss": 2.2744414806365967, "step": 4798 }, { "epoch": 0.1523809523809524, "grad_norm": 0.2314453125, "learning_rate": 0.1, "loss": 2.2605865001678467, "step": 4800 }, { "epoch": 0.15244444444444444, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.275714159011841, "step": 4802 }, { "epoch": 0.1525079365079365, "grad_norm": 0.24609375, "learning_rate": 0.1, "loss": 2.2558155059814453, "step": 4804 }, { "epoch": 0.15257142857142858, "grad_norm": 0.2412109375, "learning_rate": 0.1, "loss": 2.266275405883789, "step": 4806 }, { "epoch": 0.15263492063492062, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.2598376274108887, "step": 4808 }, { "epoch": 0.1526984126984127, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.2723007202148438, "step": 4810 }, { "epoch": 0.15276190476190477, "grad_norm": 0.328125, "learning_rate": 0.1, "loss": 2.259803533554077, "step": 4812 }, { "epoch": 0.15282539682539684, "grad_norm": 0.2314453125, "learning_rate": 0.1, "loss": 2.266575813293457, "step": 4814 }, { "epoch": 0.15288888888888888, "grad_norm": 0.05517578125, "learning_rate": 0.1, "loss": 2.26224946975708, "step": 4816 }, { "epoch": 0.15295238095238095, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 2.273287773132324, "step": 4818 }, { "epoch": 0.15301587301587302, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.2544667720794678, "step": 4820 }, { "epoch": 0.1530793650793651, "grad_norm": 0.11669921875, "learning_rate": 0.1, "loss": 2.2758798599243164, "step": 4822 }, { "epoch": 0.15314285714285714, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.2476959228515625, "step": 4824 }, { "epoch": 0.1532063492063492, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.270059585571289, "step": 4826 }, { "epoch": 0.15326984126984128, "grad_norm": 0.26953125, "learning_rate": 0.1, "loss": 2.28082537651062, "step": 4828 }, { "epoch": 0.15333333333333332, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.2653164863586426, "step": 4830 }, { "epoch": 0.1533968253968254, "grad_norm": 0.0869140625, "learning_rate": 0.1, "loss": 2.2648427486419678, "step": 4832 }, { "epoch": 0.15346031746031746, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.2725417613983154, "step": 4834 }, { "epoch": 0.15352380952380953, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.2633843421936035, "step": 4836 }, { "epoch": 0.15358730158730158, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.2554657459259033, "step": 4838 }, { "epoch": 0.15365079365079365, "grad_norm": 0.302734375, "learning_rate": 0.1, "loss": 2.284818172454834, "step": 4840 }, { "epoch": 0.15371428571428572, "grad_norm": 0.23828125, "learning_rate": 0.1, "loss": 2.2788193225860596, "step": 4842 }, { "epoch": 0.1537777777777778, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.2591826915740967, "step": 4844 }, { "epoch": 0.15384126984126983, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.2509100437164307, "step": 4846 }, { "epoch": 0.1539047619047619, "grad_norm": 0.0869140625, "learning_rate": 0.1, "loss": 2.2685470581054688, "step": 4848 }, { "epoch": 0.15396825396825398, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.2778966426849365, "step": 4850 }, { "epoch": 0.15403174603174602, "grad_norm": 0.296875, "learning_rate": 0.1, "loss": 2.2525289058685303, "step": 4852 }, { "epoch": 0.1540952380952381, "grad_norm": 0.40625, "learning_rate": 0.1, "loss": 2.289371967315674, "step": 4854 }, { "epoch": 0.15415873015873016, "grad_norm": 0.1943359375, "learning_rate": 0.1, "loss": 2.271483898162842, "step": 4856 }, { "epoch": 0.15422222222222223, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.220386505126953, "step": 4858 }, { "epoch": 0.15428571428571428, "grad_norm": 0.2333984375, "learning_rate": 0.1, "loss": 2.2766268253326416, "step": 4860 }, { "epoch": 0.15434920634920635, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.268421173095703, "step": 4862 }, { "epoch": 0.15441269841269842, "grad_norm": 0.07763671875, "learning_rate": 0.1, "loss": 2.2657928466796875, "step": 4864 }, { "epoch": 0.1544761904761905, "grad_norm": 0.06591796875, "learning_rate": 0.1, "loss": 2.2780115604400635, "step": 4866 }, { "epoch": 0.15453968253968253, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.286980152130127, "step": 4868 }, { "epoch": 0.1546031746031746, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.244744300842285, "step": 4870 }, { "epoch": 0.15466666666666667, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.2635936737060547, "step": 4872 }, { "epoch": 0.15473015873015872, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.269031047821045, "step": 4874 }, { "epoch": 0.1547936507936508, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.2663118839263916, "step": 4876 }, { "epoch": 0.15485714285714286, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.265953779220581, "step": 4878 }, { "epoch": 0.15492063492063493, "grad_norm": 0.259765625, "learning_rate": 0.1, "loss": 2.247471332550049, "step": 4880 }, { "epoch": 0.15498412698412697, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.263579845428467, "step": 4882 }, { "epoch": 0.15504761904761905, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.2737841606140137, "step": 4884 }, { "epoch": 0.15511111111111112, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.294534206390381, "step": 4886 }, { "epoch": 0.1551746031746032, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.277787685394287, "step": 4888 }, { "epoch": 0.15523809523809523, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.2213211059570312, "step": 4890 }, { "epoch": 0.1553015873015873, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.284451484680176, "step": 4892 }, { "epoch": 0.15536507936507937, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.288118839263916, "step": 4894 }, { "epoch": 0.15542857142857142, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.254192590713501, "step": 4896 }, { "epoch": 0.1554920634920635, "grad_norm": 0.12353515625, "learning_rate": 0.1, "loss": 2.2678418159484863, "step": 4898 }, { "epoch": 0.15555555555555556, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.2658965587615967, "step": 4900 }, { "epoch": 0.15561904761904763, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.2776918411254883, "step": 4902 }, { "epoch": 0.15568253968253967, "grad_norm": 0.26953125, "learning_rate": 0.1, "loss": 2.2595179080963135, "step": 4904 }, { "epoch": 0.15574603174603174, "grad_norm": 0.2431640625, "learning_rate": 0.1, "loss": 2.267033100128174, "step": 4906 }, { "epoch": 0.15580952380952381, "grad_norm": 0.2451171875, "learning_rate": 0.1, "loss": 2.2166504859924316, "step": 4908 }, { "epoch": 0.15587301587301589, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.281113624572754, "step": 4910 }, { "epoch": 0.15593650793650793, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.259669542312622, "step": 4912 }, { "epoch": 0.156, "grad_norm": 0.083984375, "learning_rate": 0.1, "loss": 2.2450602054595947, "step": 4914 }, { "epoch": 0.15606349206349207, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.253636360168457, "step": 4916 }, { "epoch": 0.15612698412698411, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.2610912322998047, "step": 4918 }, { "epoch": 0.15619047619047619, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.226576089859009, "step": 4920 }, { "epoch": 0.15625396825396826, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.2533984184265137, "step": 4922 }, { "epoch": 0.15631746031746033, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.253063440322876, "step": 4924 }, { "epoch": 0.15638095238095237, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.255126953125, "step": 4926 }, { "epoch": 0.15644444444444444, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.258524179458618, "step": 4928 }, { "epoch": 0.1565079365079365, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.245518684387207, "step": 4930 }, { "epoch": 0.15657142857142858, "grad_norm": 0.10986328125, "learning_rate": 0.1, "loss": 2.2489027976989746, "step": 4932 }, { "epoch": 0.15663492063492063, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 2.2570343017578125, "step": 4934 }, { "epoch": 0.1566984126984127, "grad_norm": 0.53515625, "learning_rate": 0.1, "loss": 2.2633161544799805, "step": 4936 }, { "epoch": 0.15676190476190477, "grad_norm": 0.09375, "learning_rate": 0.1, "loss": 2.2700204849243164, "step": 4938 }, { "epoch": 0.1568253968253968, "grad_norm": 0.1953125, "learning_rate": 0.1, "loss": 2.2659599781036377, "step": 4940 }, { "epoch": 0.15688888888888888, "grad_norm": 0.11181640625, "learning_rate": 0.1, "loss": 2.252875804901123, "step": 4942 }, { "epoch": 0.15695238095238095, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.239166021347046, "step": 4944 }, { "epoch": 0.15701587301587303, "grad_norm": 0.1953125, "learning_rate": 0.1, "loss": 2.2401669025421143, "step": 4946 }, { "epoch": 0.15707936507936507, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.2538061141967773, "step": 4948 }, { "epoch": 0.15714285714285714, "grad_norm": 0.07763671875, "learning_rate": 0.1, "loss": 2.253511428833008, "step": 4950 }, { "epoch": 0.1572063492063492, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.257477045059204, "step": 4952 }, { "epoch": 0.15726984126984128, "grad_norm": 0.298828125, "learning_rate": 0.1, "loss": 2.2615082263946533, "step": 4954 }, { "epoch": 0.15733333333333333, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.2325854301452637, "step": 4956 }, { "epoch": 0.1573968253968254, "grad_norm": 0.0869140625, "learning_rate": 0.1, "loss": 2.2421278953552246, "step": 4958 }, { "epoch": 0.15746031746031747, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.2368688583374023, "step": 4960 }, { "epoch": 0.1575238095238095, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.257746458053589, "step": 4962 }, { "epoch": 0.15758730158730158, "grad_norm": 0.07275390625, "learning_rate": 0.1, "loss": 2.2700254917144775, "step": 4964 }, { "epoch": 0.15765079365079365, "grad_norm": 0.0869140625, "learning_rate": 0.1, "loss": 2.2365431785583496, "step": 4966 }, { "epoch": 0.15771428571428572, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.2262678146362305, "step": 4968 }, { "epoch": 0.15777777777777777, "grad_norm": 0.224609375, "learning_rate": 0.1, "loss": 2.2592132091522217, "step": 4970 }, { "epoch": 0.15784126984126984, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.2403512001037598, "step": 4972 }, { "epoch": 0.1579047619047619, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.276103973388672, "step": 4974 }, { "epoch": 0.15796825396825398, "grad_norm": 0.2236328125, "learning_rate": 0.1, "loss": 2.2738139629364014, "step": 4976 }, { "epoch": 0.15803174603174602, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.2761828899383545, "step": 4978 }, { "epoch": 0.1580952380952381, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.2704248428344727, "step": 4980 }, { "epoch": 0.15815873015873017, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.238813638687134, "step": 4982 }, { "epoch": 0.1582222222222222, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.275257110595703, "step": 4984 }, { "epoch": 0.15828571428571428, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.2381954193115234, "step": 4986 }, { "epoch": 0.15834920634920635, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.2375190258026123, "step": 4988 }, { "epoch": 0.15841269841269842, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.2449967861175537, "step": 4990 }, { "epoch": 0.15847619047619046, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.2624034881591797, "step": 4992 }, { "epoch": 0.15853968253968254, "grad_norm": 0.10546875, "learning_rate": 0.1, "loss": 2.240557909011841, "step": 4994 }, { "epoch": 0.1586031746031746, "grad_norm": 0.080078125, "learning_rate": 0.1, "loss": 2.242172956466675, "step": 4996 }, { "epoch": 0.15866666666666668, "grad_norm": 0.357421875, "learning_rate": 0.1, "loss": 2.2322025299072266, "step": 4998 }, { "epoch": 0.15873015873015872, "grad_norm": 0.392578125, "learning_rate": 0.1, "loss": 2.2726588249206543, "step": 5000 }, { "epoch": 0.1587936507936508, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.237313985824585, "step": 5002 }, { "epoch": 0.15885714285714286, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.2391881942749023, "step": 5004 }, { "epoch": 0.15892063492063493, "grad_norm": 0.07666015625, "learning_rate": 0.1, "loss": 2.2513434886932373, "step": 5006 }, { "epoch": 0.15898412698412698, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.2696645259857178, "step": 5008 }, { "epoch": 0.15904761904761905, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.222923994064331, "step": 5010 }, { "epoch": 0.15911111111111112, "grad_norm": 0.06201171875, "learning_rate": 0.1, "loss": 2.2680246829986572, "step": 5012 }, { "epoch": 0.15917460317460316, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.284579038619995, "step": 5014 }, { "epoch": 0.15923809523809523, "grad_norm": 0.10400390625, "learning_rate": 0.1, "loss": 2.2308084964752197, "step": 5016 }, { "epoch": 0.1593015873015873, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.283543109893799, "step": 5018 }, { "epoch": 0.15936507936507938, "grad_norm": 0.09521484375, "learning_rate": 0.1, "loss": 2.231330633163452, "step": 5020 }, { "epoch": 0.15942857142857142, "grad_norm": 0.330078125, "learning_rate": 0.1, "loss": 2.2461466789245605, "step": 5022 }, { "epoch": 0.1594920634920635, "grad_norm": 0.302734375, "learning_rate": 0.1, "loss": 2.2194268703460693, "step": 5024 }, { "epoch": 0.15955555555555556, "grad_norm": 0.07421875, "learning_rate": 0.1, "loss": 2.2064836025238037, "step": 5026 }, { "epoch": 0.15961904761904763, "grad_norm": 0.2294921875, "learning_rate": 0.1, "loss": 2.237323522567749, "step": 5028 }, { "epoch": 0.15968253968253968, "grad_norm": 0.10546875, "learning_rate": 0.1, "loss": 2.2373275756835938, "step": 5030 }, { "epoch": 0.15974603174603175, "grad_norm": 0.062255859375, "learning_rate": 0.1, "loss": 2.220587730407715, "step": 5032 }, { "epoch": 0.15980952380952382, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.2245140075683594, "step": 5034 }, { "epoch": 0.15987301587301586, "grad_norm": 0.1220703125, "learning_rate": 0.1, "loss": 2.24704909324646, "step": 5036 }, { "epoch": 0.15993650793650793, "grad_norm": 0.1650390625, "learning_rate": 0.1, "loss": 2.221097230911255, "step": 5038 }, { "epoch": 0.16, "grad_norm": 0.11767578125, "learning_rate": 0.1, "loss": 2.222346544265747, "step": 5040 }, { "epoch": 0.16006349206349207, "grad_norm": 0.083984375, "learning_rate": 0.1, "loss": 2.222827911376953, "step": 5042 }, { "epoch": 0.16012698412698412, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.249751091003418, "step": 5044 }, { "epoch": 0.1601904761904762, "grad_norm": 0.298828125, "learning_rate": 0.1, "loss": 2.2378947734832764, "step": 5046 }, { "epoch": 0.16025396825396826, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.2625467777252197, "step": 5048 }, { "epoch": 0.16031746031746033, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.245082378387451, "step": 5050 }, { "epoch": 0.16038095238095237, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.2310752868652344, "step": 5052 }, { "epoch": 0.16044444444444445, "grad_norm": 0.330078125, "learning_rate": 0.1, "loss": 2.2522566318511963, "step": 5054 }, { "epoch": 0.16050793650793652, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.222249746322632, "step": 5056 }, { "epoch": 0.16057142857142856, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.1979217529296875, "step": 5058 }, { "epoch": 0.16063492063492063, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.2300143241882324, "step": 5060 }, { "epoch": 0.1606984126984127, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.240529775619507, "step": 5062 }, { "epoch": 0.16076190476190477, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.2198026180267334, "step": 5064 }, { "epoch": 0.16082539682539682, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.2088072299957275, "step": 5066 }, { "epoch": 0.1608888888888889, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.191087007522583, "step": 5068 }, { "epoch": 0.16095238095238096, "grad_norm": 0.11083984375, "learning_rate": 0.1, "loss": 2.2324159145355225, "step": 5070 }, { "epoch": 0.16101587301587303, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.222407341003418, "step": 5072 }, { "epoch": 0.16107936507936507, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.251791477203369, "step": 5074 }, { "epoch": 0.16114285714285714, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.2332115173339844, "step": 5076 }, { "epoch": 0.16120634920634921, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.2148568630218506, "step": 5078 }, { "epoch": 0.16126984126984126, "grad_norm": 0.09619140625, "learning_rate": 0.1, "loss": 2.2362165451049805, "step": 5080 }, { "epoch": 0.16133333333333333, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.2186381816864014, "step": 5082 }, { "epoch": 0.1613968253968254, "grad_norm": 0.310546875, "learning_rate": 0.1, "loss": 2.2227377891540527, "step": 5084 }, { "epoch": 0.16146031746031747, "grad_norm": 0.306640625, "learning_rate": 0.1, "loss": 2.2452430725097656, "step": 5086 }, { "epoch": 0.16152380952380951, "grad_norm": 0.08056640625, "learning_rate": 0.1, "loss": 2.2232744693756104, "step": 5088 }, { "epoch": 0.16158730158730158, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.2211923599243164, "step": 5090 }, { "epoch": 0.16165079365079366, "grad_norm": 0.34765625, "learning_rate": 0.1, "loss": 2.239081859588623, "step": 5092 }, { "epoch": 0.16171428571428573, "grad_norm": 0.2353515625, "learning_rate": 0.1, "loss": 2.2264490127563477, "step": 5094 }, { "epoch": 0.16177777777777777, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.2216522693634033, "step": 5096 }, { "epoch": 0.16184126984126984, "grad_norm": 0.22265625, "learning_rate": 0.1, "loss": 2.260937452316284, "step": 5098 }, { "epoch": 0.1619047619047619, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.2209346294403076, "step": 5100 }, { "epoch": 0.16196825396825396, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.232752799987793, "step": 5102 }, { "epoch": 0.16203174603174603, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.2323899269104004, "step": 5104 }, { "epoch": 0.1620952380952381, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.2197299003601074, "step": 5106 }, { "epoch": 0.16215873015873017, "grad_norm": 0.064453125, "learning_rate": 0.1, "loss": 2.254833221435547, "step": 5108 }, { "epoch": 0.1622222222222222, "grad_norm": 0.08837890625, "learning_rate": 0.1, "loss": 2.2183051109313965, "step": 5110 }, { "epoch": 0.16228571428571428, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.2214436531066895, "step": 5112 }, { "epoch": 0.16234920634920635, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.2467470169067383, "step": 5114 }, { "epoch": 0.16241269841269843, "grad_norm": 0.2734375, "learning_rate": 0.1, "loss": 2.2293200492858887, "step": 5116 }, { "epoch": 0.16247619047619047, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.228208065032959, "step": 5118 }, { "epoch": 0.16253968253968254, "grad_norm": 0.296875, "learning_rate": 0.1, "loss": 2.2648887634277344, "step": 5120 }, { "epoch": 0.1626031746031746, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.211042881011963, "step": 5122 }, { "epoch": 0.16266666666666665, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.2363932132720947, "step": 5124 }, { "epoch": 0.16273015873015872, "grad_norm": 0.095703125, "learning_rate": 0.1, "loss": 2.208954095840454, "step": 5126 }, { "epoch": 0.1627936507936508, "grad_norm": 0.0673828125, "learning_rate": 0.1, "loss": 2.2209115028381348, "step": 5128 }, { "epoch": 0.16285714285714287, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.24106502532959, "step": 5130 }, { "epoch": 0.1629206349206349, "grad_norm": 0.216796875, "learning_rate": 0.1, "loss": 2.2115585803985596, "step": 5132 }, { "epoch": 0.16298412698412698, "grad_norm": 0.251953125, "learning_rate": 0.1, "loss": 2.2479374408721924, "step": 5134 }, { "epoch": 0.16304761904761905, "grad_norm": 0.1220703125, "learning_rate": 0.1, "loss": 2.2393784523010254, "step": 5136 }, { "epoch": 0.16311111111111112, "grad_norm": 0.059814453125, "learning_rate": 0.1, "loss": 2.250758171081543, "step": 5138 }, { "epoch": 0.16317460317460317, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.2317631244659424, "step": 5140 }, { "epoch": 0.16323809523809524, "grad_norm": 0.2734375, "learning_rate": 0.1, "loss": 2.2231292724609375, "step": 5142 }, { "epoch": 0.1633015873015873, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.2032456398010254, "step": 5144 }, { "epoch": 0.16336507936507935, "grad_norm": 0.0712890625, "learning_rate": 0.1, "loss": 2.2521567344665527, "step": 5146 }, { "epoch": 0.16342857142857142, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.223768949508667, "step": 5148 }, { "epoch": 0.1634920634920635, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.2328879833221436, "step": 5150 }, { "epoch": 0.16355555555555557, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.2296142578125, "step": 5152 }, { "epoch": 0.1636190476190476, "grad_norm": 0.26953125, "learning_rate": 0.1, "loss": 2.237793445587158, "step": 5154 }, { "epoch": 0.16368253968253968, "grad_norm": 0.404296875, "learning_rate": 0.1, "loss": 2.2338759899139404, "step": 5156 }, { "epoch": 0.16374603174603175, "grad_norm": 0.08056640625, "learning_rate": 0.1, "loss": 2.2009706497192383, "step": 5158 }, { "epoch": 0.16380952380952382, "grad_norm": 0.057373046875, "learning_rate": 0.1, "loss": 2.2130651473999023, "step": 5160 }, { "epoch": 0.16387301587301586, "grad_norm": 0.041748046875, "learning_rate": 0.1, "loss": 2.193004846572876, "step": 5162 }, { "epoch": 0.16393650793650794, "grad_norm": 0.1171875, "learning_rate": 0.1, "loss": 2.2272660732269287, "step": 5164 }, { "epoch": 0.164, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.2205116748809814, "step": 5166 }, { "epoch": 0.16406349206349205, "grad_norm": 0.3828125, "learning_rate": 0.1, "loss": 2.2040321826934814, "step": 5168 }, { "epoch": 0.16412698412698412, "grad_norm": 0.298828125, "learning_rate": 0.1, "loss": 2.223106622695923, "step": 5170 }, { "epoch": 0.1641904761904762, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.2366268634796143, "step": 5172 }, { "epoch": 0.16425396825396826, "grad_norm": 0.11572265625, "learning_rate": 0.1, "loss": 2.221320629119873, "step": 5174 }, { "epoch": 0.1643174603174603, "grad_norm": 0.07666015625, "learning_rate": 0.1, "loss": 2.195066452026367, "step": 5176 }, { "epoch": 0.16438095238095238, "grad_norm": 0.08349609375, "learning_rate": 0.1, "loss": 2.233842134475708, "step": 5178 }, { "epoch": 0.16444444444444445, "grad_norm": 0.10595703125, "learning_rate": 0.1, "loss": 2.233755111694336, "step": 5180 }, { "epoch": 0.16450793650793652, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.236267328262329, "step": 5182 }, { "epoch": 0.16457142857142856, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.230855703353882, "step": 5184 }, { "epoch": 0.16463492063492063, "grad_norm": 0.341796875, "learning_rate": 0.1, "loss": 2.23653507232666, "step": 5186 }, { "epoch": 0.1646984126984127, "grad_norm": 0.322265625, "learning_rate": 0.1, "loss": 2.2414145469665527, "step": 5188 }, { "epoch": 0.16476190476190475, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.214081287384033, "step": 5190 }, { "epoch": 0.16482539682539682, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.2288808822631836, "step": 5192 }, { "epoch": 0.1648888888888889, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.2191929817199707, "step": 5194 }, { "epoch": 0.16495238095238096, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.230496883392334, "step": 5196 }, { "epoch": 0.165015873015873, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.188372850418091, "step": 5198 }, { "epoch": 0.16507936507936508, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.212965726852417, "step": 5200 }, { "epoch": 0.16514285714285715, "grad_norm": 0.203125, "learning_rate": 0.1, "loss": 2.2081892490386963, "step": 5202 }, { "epoch": 0.16520634920634922, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.204195261001587, "step": 5204 }, { "epoch": 0.16526984126984126, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.224276542663574, "step": 5206 }, { "epoch": 0.16533333333333333, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.2126972675323486, "step": 5208 }, { "epoch": 0.1653968253968254, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.2141454219818115, "step": 5210 }, { "epoch": 0.16546031746031745, "grad_norm": 0.08349609375, "learning_rate": 0.1, "loss": 2.2428598403930664, "step": 5212 }, { "epoch": 0.16552380952380952, "grad_norm": 0.2216796875, "learning_rate": 0.1, "loss": 2.225158452987671, "step": 5214 }, { "epoch": 0.1655873015873016, "grad_norm": 0.3984375, "learning_rate": 0.1, "loss": 2.2441675662994385, "step": 5216 }, { "epoch": 0.16565079365079366, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.2186105251312256, "step": 5218 }, { "epoch": 0.1657142857142857, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.221276044845581, "step": 5220 }, { "epoch": 0.16577777777777777, "grad_norm": 0.08984375, "learning_rate": 0.1, "loss": 2.2159576416015625, "step": 5222 }, { "epoch": 0.16584126984126984, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.224595308303833, "step": 5224 }, { "epoch": 0.16590476190476192, "grad_norm": 0.06640625, "learning_rate": 0.1, "loss": 2.238022804260254, "step": 5226 }, { "epoch": 0.16596825396825396, "grad_norm": 0.1103515625, "learning_rate": 0.1, "loss": 2.2206733226776123, "step": 5228 }, { "epoch": 0.16603174603174603, "grad_norm": 0.1943359375, "learning_rate": 0.1, "loss": 2.232161521911621, "step": 5230 }, { "epoch": 0.1660952380952381, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.2291412353515625, "step": 5232 }, { "epoch": 0.16615873015873017, "grad_norm": 0.369140625, "learning_rate": 0.1, "loss": 2.218386650085449, "step": 5234 }, { "epoch": 0.16622222222222222, "grad_norm": 0.21484375, "learning_rate": 0.1, "loss": 2.245426654815674, "step": 5236 }, { "epoch": 0.1662857142857143, "grad_norm": 0.06982421875, "learning_rate": 0.1, "loss": 2.2098357677459717, "step": 5238 }, { "epoch": 0.16634920634920636, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.241814374923706, "step": 5240 }, { "epoch": 0.1664126984126984, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.2069361209869385, "step": 5242 }, { "epoch": 0.16647619047619047, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.2302141189575195, "step": 5244 }, { "epoch": 0.16653968253968254, "grad_norm": 0.220703125, "learning_rate": 0.1, "loss": 2.2223875522613525, "step": 5246 }, { "epoch": 0.16660317460317461, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.235405445098877, "step": 5248 }, { "epoch": 0.16666666666666666, "grad_norm": 0.09326171875, "learning_rate": 0.1, "loss": 2.188941717147827, "step": 5250 }, { "epoch": 0.16673015873015873, "grad_norm": 0.10498046875, "learning_rate": 0.1, "loss": 2.215752601623535, "step": 5252 }, { "epoch": 0.1667936507936508, "grad_norm": 0.08837890625, "learning_rate": 0.1, "loss": 2.2274723052978516, "step": 5254 }, { "epoch": 0.16685714285714287, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.228240489959717, "step": 5256 }, { "epoch": 0.1669206349206349, "grad_norm": 0.490234375, "learning_rate": 0.1, "loss": 2.2375991344451904, "step": 5258 }, { "epoch": 0.16698412698412698, "grad_norm": 0.09814453125, "learning_rate": 0.1, "loss": 2.212428092956543, "step": 5260 }, { "epoch": 0.16704761904761906, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.218977451324463, "step": 5262 }, { "epoch": 0.1671111111111111, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.2352826595306396, "step": 5264 }, { "epoch": 0.16717460317460317, "grad_norm": 0.06982421875, "learning_rate": 0.1, "loss": 2.2354068756103516, "step": 5266 }, { "epoch": 0.16723809523809524, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.235917329788208, "step": 5268 }, { "epoch": 0.1673015873015873, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.2140448093414307, "step": 5270 }, { "epoch": 0.16736507936507936, "grad_norm": 0.0654296875, "learning_rate": 0.1, "loss": 2.2267987728118896, "step": 5272 }, { "epoch": 0.16742857142857143, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.2129578590393066, "step": 5274 }, { "epoch": 0.1674920634920635, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.2155518531799316, "step": 5276 }, { "epoch": 0.16755555555555557, "grad_norm": 0.30078125, "learning_rate": 0.1, "loss": 2.228393316268921, "step": 5278 }, { "epoch": 0.1676190476190476, "grad_norm": 0.220703125, "learning_rate": 0.1, "loss": 2.225015640258789, "step": 5280 }, { "epoch": 0.16768253968253968, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.2127954959869385, "step": 5282 }, { "epoch": 0.16774603174603175, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.2165286540985107, "step": 5284 }, { "epoch": 0.1678095238095238, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.230541229248047, "step": 5286 }, { "epoch": 0.16787301587301587, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.219632148742676, "step": 5288 }, { "epoch": 0.16793650793650794, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.2218093872070312, "step": 5290 }, { "epoch": 0.168, "grad_norm": 0.055908203125, "learning_rate": 0.1, "loss": 2.2170727252960205, "step": 5292 }, { "epoch": 0.16806349206349205, "grad_norm": 0.07177734375, "learning_rate": 0.1, "loss": 2.2225778102874756, "step": 5294 }, { "epoch": 0.16812698412698412, "grad_norm": 0.0517578125, "learning_rate": 0.1, "loss": 2.2472734451293945, "step": 5296 }, { "epoch": 0.1681904761904762, "grad_norm": 0.07666015625, "learning_rate": 0.1, "loss": 2.239201784133911, "step": 5298 }, { "epoch": 0.16825396825396827, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.2380897998809814, "step": 5300 }, { "epoch": 0.1683174603174603, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.2443320751190186, "step": 5302 }, { "epoch": 0.16838095238095238, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.201160430908203, "step": 5304 }, { "epoch": 0.16844444444444445, "grad_norm": 0.423828125, "learning_rate": 0.1, "loss": 2.2377402782440186, "step": 5306 }, { "epoch": 0.1685079365079365, "grad_norm": 0.224609375, "learning_rate": 0.1, "loss": 2.204244613647461, "step": 5308 }, { "epoch": 0.16857142857142857, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.2251334190368652, "step": 5310 }, { "epoch": 0.16863492063492064, "grad_norm": 0.111328125, "learning_rate": 0.1, "loss": 2.226300001144409, "step": 5312 }, { "epoch": 0.1686984126984127, "grad_norm": 0.07177734375, "learning_rate": 0.1, "loss": 2.233757972717285, "step": 5314 }, { "epoch": 0.16876190476190475, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.2315447330474854, "step": 5316 }, { "epoch": 0.16882539682539682, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.2387020587921143, "step": 5318 }, { "epoch": 0.1688888888888889, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.2487027645111084, "step": 5320 }, { "epoch": 0.16895238095238096, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.2567031383514404, "step": 5322 }, { "epoch": 0.169015873015873, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.262542486190796, "step": 5324 }, { "epoch": 0.16907936507936508, "grad_norm": 0.236328125, "learning_rate": 0.1, "loss": 2.27374529838562, "step": 5326 }, { "epoch": 0.16914285714285715, "grad_norm": 0.337890625, "learning_rate": 0.1, "loss": 2.208045721054077, "step": 5328 }, { "epoch": 0.1692063492063492, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.234679698944092, "step": 5330 }, { "epoch": 0.16926984126984126, "grad_norm": 0.078125, "learning_rate": 0.1, "loss": 2.2552943229675293, "step": 5332 }, { "epoch": 0.16933333333333334, "grad_norm": 0.0810546875, "learning_rate": 0.1, "loss": 2.2510366439819336, "step": 5334 }, { "epoch": 0.1693968253968254, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.252507209777832, "step": 5336 }, { "epoch": 0.16946031746031745, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.2388932704925537, "step": 5338 }, { "epoch": 0.16952380952380952, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.206791877746582, "step": 5340 }, { "epoch": 0.1695873015873016, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.2619879245758057, "step": 5342 }, { "epoch": 0.16965079365079366, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.236004114151001, "step": 5344 }, { "epoch": 0.1697142857142857, "grad_norm": 0.232421875, "learning_rate": 0.1, "loss": 2.237311601638794, "step": 5346 }, { "epoch": 0.16977777777777778, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.247586965560913, "step": 5348 }, { "epoch": 0.16984126984126985, "grad_norm": 0.11767578125, "learning_rate": 0.1, "loss": 2.229217052459717, "step": 5350 }, { "epoch": 0.1699047619047619, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.2378089427948, "step": 5352 }, { "epoch": 0.16996825396825396, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.2429304122924805, "step": 5354 }, { "epoch": 0.17003174603174603, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.2513580322265625, "step": 5356 }, { "epoch": 0.1700952380952381, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.232398271560669, "step": 5358 }, { "epoch": 0.17015873015873015, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.2437007427215576, "step": 5360 }, { "epoch": 0.17022222222222222, "grad_norm": 0.302734375, "learning_rate": 0.1, "loss": 2.239910364151001, "step": 5362 }, { "epoch": 0.1702857142857143, "grad_norm": 0.0595703125, "learning_rate": 0.1, "loss": 2.263826847076416, "step": 5364 }, { "epoch": 0.17034920634920636, "grad_norm": 0.1982421875, "learning_rate": 0.1, "loss": 2.236025810241699, "step": 5366 }, { "epoch": 0.1704126984126984, "grad_norm": 0.216796875, "learning_rate": 0.1, "loss": 2.2458367347717285, "step": 5368 }, { "epoch": 0.17047619047619048, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.1940269470214844, "step": 5370 }, { "epoch": 0.17053968253968255, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.238917112350464, "step": 5372 }, { "epoch": 0.1706031746031746, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.2323081493377686, "step": 5374 }, { "epoch": 0.17066666666666666, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.2760086059570312, "step": 5376 }, { "epoch": 0.17073015873015873, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.2591707706451416, "step": 5378 }, { "epoch": 0.1707936507936508, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.22514009475708, "step": 5380 }, { "epoch": 0.17085714285714285, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.2517614364624023, "step": 5382 }, { "epoch": 0.17092063492063492, "grad_norm": 0.1416015625, "learning_rate": 0.1, "loss": 2.2130422592163086, "step": 5384 }, { "epoch": 0.170984126984127, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.237107753753662, "step": 5386 }, { "epoch": 0.17104761904761906, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.22249436378479, "step": 5388 }, { "epoch": 0.1711111111111111, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.2083210945129395, "step": 5390 }, { "epoch": 0.17117460317460317, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.2472527027130127, "step": 5392 }, { "epoch": 0.17123809523809524, "grad_norm": 0.0625, "learning_rate": 0.1, "loss": 2.2192068099975586, "step": 5394 }, { "epoch": 0.1713015873015873, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.228910446166992, "step": 5396 }, { "epoch": 0.17136507936507936, "grad_norm": 0.22265625, "learning_rate": 0.1, "loss": 2.2466044425964355, "step": 5398 }, { "epoch": 0.17142857142857143, "grad_norm": 0.2451171875, "learning_rate": 0.1, "loss": 2.2460954189300537, "step": 5400 }, { "epoch": 0.1714920634920635, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.2400946617126465, "step": 5402 }, { "epoch": 0.17155555555555554, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.228113889694214, "step": 5404 }, { "epoch": 0.17161904761904762, "grad_norm": 0.05517578125, "learning_rate": 0.1, "loss": 2.250746488571167, "step": 5406 }, { "epoch": 0.1716825396825397, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.2206196784973145, "step": 5408 }, { "epoch": 0.17174603174603176, "grad_norm": 0.310546875, "learning_rate": 0.1, "loss": 2.218111753463745, "step": 5410 }, { "epoch": 0.1718095238095238, "grad_norm": 0.28125, "learning_rate": 0.1, "loss": 2.239234209060669, "step": 5412 }, { "epoch": 0.17187301587301587, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.238288640975952, "step": 5414 }, { "epoch": 0.17193650793650794, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.247776985168457, "step": 5416 }, { "epoch": 0.172, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.2185373306274414, "step": 5418 }, { "epoch": 0.17206349206349206, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.2487704753875732, "step": 5420 }, { "epoch": 0.17212698412698413, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.230968952178955, "step": 5422 }, { "epoch": 0.1721904761904762, "grad_norm": 0.1240234375, "learning_rate": 0.1, "loss": 2.240570545196533, "step": 5424 }, { "epoch": 0.17225396825396824, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.2375564575195312, "step": 5426 }, { "epoch": 0.1723174603174603, "grad_norm": 0.392578125, "learning_rate": 0.1, "loss": 2.2417004108428955, "step": 5428 }, { "epoch": 0.17238095238095238, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.248905658721924, "step": 5430 }, { "epoch": 0.17244444444444446, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.224895477294922, "step": 5432 }, { "epoch": 0.1725079365079365, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.246554374694824, "step": 5434 }, { "epoch": 0.17257142857142857, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.2528350353240967, "step": 5436 }, { "epoch": 0.17263492063492064, "grad_norm": 0.055908203125, "learning_rate": 0.1, "loss": 2.248203992843628, "step": 5438 }, { "epoch": 0.1726984126984127, "grad_norm": 0.0751953125, "learning_rate": 0.1, "loss": 2.2348220348358154, "step": 5440 }, { "epoch": 0.17276190476190476, "grad_norm": 0.2314453125, "learning_rate": 0.1, "loss": 2.259739875793457, "step": 5442 }, { "epoch": 0.17282539682539683, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.2497034072875977, "step": 5444 }, { "epoch": 0.1728888888888889, "grad_norm": 0.059326171875, "learning_rate": 0.1, "loss": 2.239690065383911, "step": 5446 }, { "epoch": 0.17295238095238094, "grad_norm": 0.051025390625, "learning_rate": 0.1, "loss": 2.2197043895721436, "step": 5448 }, { "epoch": 0.173015873015873, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.2467963695526123, "step": 5450 }, { "epoch": 0.17307936507936508, "grad_norm": 0.375, "learning_rate": 0.1, "loss": 2.250375270843506, "step": 5452 }, { "epoch": 0.17314285714285715, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.271305799484253, "step": 5454 }, { "epoch": 0.1732063492063492, "grad_norm": 0.10986328125, "learning_rate": 0.1, "loss": 2.2277557849884033, "step": 5456 }, { "epoch": 0.17326984126984127, "grad_norm": 0.07861328125, "learning_rate": 0.1, "loss": 2.246593952178955, "step": 5458 }, { "epoch": 0.17333333333333334, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.228313684463501, "step": 5460 }, { "epoch": 0.1733968253968254, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.215380907058716, "step": 5462 }, { "epoch": 0.17346031746031745, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.2452847957611084, "step": 5464 }, { "epoch": 0.17352380952380952, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.249474048614502, "step": 5466 }, { "epoch": 0.1735873015873016, "grad_norm": 0.0703125, "learning_rate": 0.1, "loss": 2.2301065921783447, "step": 5468 }, { "epoch": 0.17365079365079364, "grad_norm": 0.1025390625, "learning_rate": 0.1, "loss": 2.232131242752075, "step": 5470 }, { "epoch": 0.1737142857142857, "grad_norm": 0.061767578125, "learning_rate": 0.1, "loss": 2.234830856323242, "step": 5472 }, { "epoch": 0.17377777777777778, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.220759391784668, "step": 5474 }, { "epoch": 0.17384126984126985, "grad_norm": 0.5078125, "learning_rate": 0.1, "loss": 2.254563093185425, "step": 5476 }, { "epoch": 0.1739047619047619, "grad_norm": 0.09326171875, "learning_rate": 0.1, "loss": 2.2214040756225586, "step": 5478 }, { "epoch": 0.17396825396825397, "grad_norm": 0.0625, "learning_rate": 0.1, "loss": 2.2419307231903076, "step": 5480 }, { "epoch": 0.17403174603174604, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.240987777709961, "step": 5482 }, { "epoch": 0.1740952380952381, "grad_norm": 0.0810546875, "learning_rate": 0.1, "loss": 2.2481160163879395, "step": 5484 }, { "epoch": 0.17415873015873015, "grad_norm": 0.10888671875, "learning_rate": 0.1, "loss": 2.2311184406280518, "step": 5486 }, { "epoch": 0.17422222222222222, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.252633571624756, "step": 5488 }, { "epoch": 0.1742857142857143, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.2562689781188965, "step": 5490 }, { "epoch": 0.17434920634920634, "grad_norm": 0.244140625, "learning_rate": 0.1, "loss": 2.2305996417999268, "step": 5492 }, { "epoch": 0.1744126984126984, "grad_norm": 0.40234375, "learning_rate": 0.1, "loss": 2.228957414627075, "step": 5494 }, { "epoch": 0.17447619047619048, "grad_norm": 0.07568359375, "learning_rate": 0.1, "loss": 2.2482094764709473, "step": 5496 }, { "epoch": 0.17453968253968255, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.2103452682495117, "step": 5498 }, { "epoch": 0.1746031746031746, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.243201732635498, "step": 5500 }, { "epoch": 0.17466666666666666, "grad_norm": 0.314453125, "learning_rate": 0.1, "loss": 2.2494349479675293, "step": 5502 }, { "epoch": 0.17473015873015874, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.2386648654937744, "step": 5504 }, { "epoch": 0.1747936507936508, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.2504119873046875, "step": 5506 }, { "epoch": 0.17485714285714285, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.2377138137817383, "step": 5508 }, { "epoch": 0.17492063492063492, "grad_norm": 0.0888671875, "learning_rate": 0.1, "loss": 2.21343994140625, "step": 5510 }, { "epoch": 0.174984126984127, "grad_norm": 0.1875, "learning_rate": 0.1, "loss": 2.24652099609375, "step": 5512 }, { "epoch": 0.17504761904761904, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.2008094787597656, "step": 5514 }, { "epoch": 0.1751111111111111, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.219883441925049, "step": 5516 }, { "epoch": 0.17517460317460318, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.2402069568634033, "step": 5518 }, { "epoch": 0.17523809523809525, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.2644662857055664, "step": 5520 }, { "epoch": 0.1753015873015873, "grad_norm": 0.1162109375, "learning_rate": 0.1, "loss": 2.2146880626678467, "step": 5522 }, { "epoch": 0.17536507936507936, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.2293951511383057, "step": 5524 }, { "epoch": 0.17542857142857143, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.239231586456299, "step": 5526 }, { "epoch": 0.1754920634920635, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.212488889694214, "step": 5528 }, { "epoch": 0.17555555555555555, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.2368595600128174, "step": 5530 }, { "epoch": 0.17561904761904762, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.244206428527832, "step": 5532 }, { "epoch": 0.1756825396825397, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.2008259296417236, "step": 5534 }, { "epoch": 0.17574603174603173, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.210707664489746, "step": 5536 }, { "epoch": 0.1758095238095238, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.222578763961792, "step": 5538 }, { "epoch": 0.17587301587301588, "grad_norm": 0.06201171875, "learning_rate": 0.1, "loss": 2.2187490463256836, "step": 5540 }, { "epoch": 0.17593650793650795, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.2362513542175293, "step": 5542 }, { "epoch": 0.176, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.2371938228607178, "step": 5544 }, { "epoch": 0.17606349206349206, "grad_norm": 0.34375, "learning_rate": 0.1, "loss": 2.2370688915252686, "step": 5546 }, { "epoch": 0.17612698412698413, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.2207376956939697, "step": 5548 }, { "epoch": 0.1761904761904762, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.227200746536255, "step": 5550 }, { "epoch": 0.17625396825396825, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.182065963745117, "step": 5552 }, { "epoch": 0.17631746031746032, "grad_norm": 0.0693359375, "learning_rate": 0.1, "loss": 2.2325737476348877, "step": 5554 }, { "epoch": 0.1763809523809524, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.2160959243774414, "step": 5556 }, { "epoch": 0.17644444444444443, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.222426176071167, "step": 5558 }, { "epoch": 0.1765079365079365, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.225956678390503, "step": 5560 }, { "epoch": 0.17657142857142857, "grad_norm": 0.0966796875, "learning_rate": 0.1, "loss": 2.220018148422241, "step": 5562 }, { "epoch": 0.17663492063492064, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.242483615875244, "step": 5564 }, { "epoch": 0.1766984126984127, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.24456787109375, "step": 5566 }, { "epoch": 0.17676190476190476, "grad_norm": 0.203125, "learning_rate": 0.1, "loss": 2.2262074947357178, "step": 5568 }, { "epoch": 0.17682539682539683, "grad_norm": 0.408203125, "learning_rate": 0.1, "loss": 2.2252190113067627, "step": 5570 }, { "epoch": 0.1768888888888889, "grad_norm": 0.396484375, "learning_rate": 0.1, "loss": 2.225132942199707, "step": 5572 }, { "epoch": 0.17695238095238094, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.2306530475616455, "step": 5574 }, { "epoch": 0.17701587301587302, "grad_norm": 0.10498046875, "learning_rate": 0.1, "loss": 2.245706081390381, "step": 5576 }, { "epoch": 0.1770793650793651, "grad_norm": 0.1875, "learning_rate": 0.1, "loss": 2.236727714538574, "step": 5578 }, { "epoch": 0.17714285714285713, "grad_norm": 0.10498046875, "learning_rate": 0.1, "loss": 2.2132623195648193, "step": 5580 }, { "epoch": 0.1772063492063492, "grad_norm": 0.078125, "learning_rate": 0.1, "loss": 2.2096645832061768, "step": 5582 }, { "epoch": 0.17726984126984127, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.227487802505493, "step": 5584 }, { "epoch": 0.17733333333333334, "grad_norm": 0.2431640625, "learning_rate": 0.1, "loss": 2.2216334342956543, "step": 5586 }, { "epoch": 0.17739682539682539, "grad_norm": 0.11083984375, "learning_rate": 0.1, "loss": 2.221672534942627, "step": 5588 }, { "epoch": 0.17746031746031746, "grad_norm": 0.103515625, "learning_rate": 0.1, "loss": 2.2232348918914795, "step": 5590 }, { "epoch": 0.17752380952380953, "grad_norm": 0.05224609375, "learning_rate": 0.1, "loss": 2.225348711013794, "step": 5592 }, { "epoch": 0.1775873015873016, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.2116899490356445, "step": 5594 }, { "epoch": 0.17765079365079364, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.242435932159424, "step": 5596 }, { "epoch": 0.1777142857142857, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.2130982875823975, "step": 5598 }, { "epoch": 0.17777777777777778, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.2269206047058105, "step": 5600 }, { "epoch": 0.17784126984126983, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.2175040245056152, "step": 5602 }, { "epoch": 0.1779047619047619, "grad_norm": 0.09375, "learning_rate": 0.1, "loss": 2.231715202331543, "step": 5604 }, { "epoch": 0.17796825396825397, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.2198948860168457, "step": 5606 }, { "epoch": 0.17803174603174604, "grad_norm": 0.061767578125, "learning_rate": 0.1, "loss": 2.2029759883880615, "step": 5608 }, { "epoch": 0.17809523809523808, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.2406513690948486, "step": 5610 }, { "epoch": 0.17815873015873016, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.2332863807678223, "step": 5612 }, { "epoch": 0.17822222222222223, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.194247245788574, "step": 5614 }, { "epoch": 0.1782857142857143, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.20070743560791, "step": 5616 }, { "epoch": 0.17834920634920634, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.2152843475341797, "step": 5618 }, { "epoch": 0.1784126984126984, "grad_norm": 0.08984375, "learning_rate": 0.1, "loss": 2.219028949737549, "step": 5620 }, { "epoch": 0.17847619047619048, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.2304656505584717, "step": 5622 }, { "epoch": 0.17853968253968253, "grad_norm": 0.11767578125, "learning_rate": 0.1, "loss": 2.2375073432922363, "step": 5624 }, { "epoch": 0.1786031746031746, "grad_norm": 0.28125, "learning_rate": 0.1, "loss": 2.2103054523468018, "step": 5626 }, { "epoch": 0.17866666666666667, "grad_norm": 0.375, "learning_rate": 0.1, "loss": 2.1941580772399902, "step": 5628 }, { "epoch": 0.17873015873015874, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.18294620513916, "step": 5630 }, { "epoch": 0.17879365079365078, "grad_norm": 0.1650390625, "learning_rate": 0.1, "loss": 2.2393198013305664, "step": 5632 }, { "epoch": 0.17885714285714285, "grad_norm": 0.07421875, "learning_rate": 0.1, "loss": 2.2223222255706787, "step": 5634 }, { "epoch": 0.17892063492063492, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.1994831562042236, "step": 5636 }, { "epoch": 0.178984126984127, "grad_norm": 0.228515625, "learning_rate": 0.1, "loss": 2.2053911685943604, "step": 5638 }, { "epoch": 0.17904761904761904, "grad_norm": 0.12353515625, "learning_rate": 0.1, "loss": 2.2221360206604004, "step": 5640 }, { "epoch": 0.1791111111111111, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.2110371589660645, "step": 5642 }, { "epoch": 0.17917460317460318, "grad_norm": 0.103515625, "learning_rate": 0.1, "loss": 2.243471384048462, "step": 5644 }, { "epoch": 0.17923809523809525, "grad_norm": 0.048583984375, "learning_rate": 0.1, "loss": 2.2105486392974854, "step": 5646 }, { "epoch": 0.1793015873015873, "grad_norm": 0.03564453125, "learning_rate": 0.1, "loss": 2.2213072776794434, "step": 5648 }, { "epoch": 0.17936507936507937, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.196397304534912, "step": 5650 }, { "epoch": 0.17942857142857144, "grad_norm": 0.216796875, "learning_rate": 0.1, "loss": 2.203707695007324, "step": 5652 }, { "epoch": 0.17949206349206348, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.241105794906616, "step": 5654 }, { "epoch": 0.17955555555555555, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.2121849060058594, "step": 5656 }, { "epoch": 0.17961904761904762, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.198814868927002, "step": 5658 }, { "epoch": 0.1796825396825397, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.218815565109253, "step": 5660 }, { "epoch": 0.17974603174603174, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.1849653720855713, "step": 5662 }, { "epoch": 0.1798095238095238, "grad_norm": 0.048583984375, "learning_rate": 0.1, "loss": 2.1933465003967285, "step": 5664 }, { "epoch": 0.17987301587301588, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.19207501411438, "step": 5666 }, { "epoch": 0.17993650793650795, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.205329656600952, "step": 5668 }, { "epoch": 0.18, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.1657142639160156, "step": 5670 }, { "epoch": 0.18006349206349206, "grad_norm": 0.056884765625, "learning_rate": 0.1, "loss": 2.1952130794525146, "step": 5672 }, { "epoch": 0.18012698412698414, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.204857110977173, "step": 5674 }, { "epoch": 0.18019047619047618, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.183090925216675, "step": 5676 }, { "epoch": 0.18025396825396825, "grad_norm": 0.0947265625, "learning_rate": 0.1, "loss": 2.180849075317383, "step": 5678 }, { "epoch": 0.18031746031746032, "grad_norm": 0.06982421875, "learning_rate": 0.1, "loss": 2.192868232727051, "step": 5680 }, { "epoch": 0.1803809523809524, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.208024024963379, "step": 5682 }, { "epoch": 0.18044444444444444, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.190525531768799, "step": 5684 }, { "epoch": 0.1805079365079365, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.1682043075561523, "step": 5686 }, { "epoch": 0.18057142857142858, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.1999316215515137, "step": 5688 }, { "epoch": 0.18063492063492065, "grad_norm": 0.251953125, "learning_rate": 0.1, "loss": 2.212862491607666, "step": 5690 }, { "epoch": 0.1806984126984127, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.247584819793701, "step": 5692 }, { "epoch": 0.18076190476190476, "grad_norm": 0.115234375, "learning_rate": 0.1, "loss": 2.200183391571045, "step": 5694 }, { "epoch": 0.18082539682539683, "grad_norm": 0.076171875, "learning_rate": 0.1, "loss": 2.2044291496276855, "step": 5696 }, { "epoch": 0.18088888888888888, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.207892894744873, "step": 5698 }, { "epoch": 0.18095238095238095, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.232621669769287, "step": 5700 }, { "epoch": 0.18101587301587302, "grad_norm": 0.11083984375, "learning_rate": 0.1, "loss": 2.2103846073150635, "step": 5702 }, { "epoch": 0.1810793650793651, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.213853597640991, "step": 5704 }, { "epoch": 0.18114285714285713, "grad_norm": 0.34375, "learning_rate": 0.1, "loss": 2.216942071914673, "step": 5706 }, { "epoch": 0.1812063492063492, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.2333099842071533, "step": 5708 }, { "epoch": 0.18126984126984128, "grad_norm": 0.0703125, "learning_rate": 0.1, "loss": 2.1842305660247803, "step": 5710 }, { "epoch": 0.18133333333333335, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.208873987197876, "step": 5712 }, { "epoch": 0.1813968253968254, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.175048828125, "step": 5714 }, { "epoch": 0.18146031746031746, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.222341537475586, "step": 5716 }, { "epoch": 0.18152380952380953, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.185990571975708, "step": 5718 }, { "epoch": 0.18158730158730158, "grad_norm": 0.1416015625, "learning_rate": 0.1, "loss": 2.2164700031280518, "step": 5720 }, { "epoch": 0.18165079365079365, "grad_norm": 0.30078125, "learning_rate": 0.1, "loss": 2.1864469051361084, "step": 5722 }, { "epoch": 0.18171428571428572, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.203237295150757, "step": 5724 }, { "epoch": 0.1817777777777778, "grad_norm": 0.29296875, "learning_rate": 0.1, "loss": 2.2073922157287598, "step": 5726 }, { "epoch": 0.18184126984126983, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.2267463207244873, "step": 5728 }, { "epoch": 0.1819047619047619, "grad_norm": 0.1875, "learning_rate": 0.1, "loss": 2.2018516063690186, "step": 5730 }, { "epoch": 0.18196825396825397, "grad_norm": 0.0576171875, "learning_rate": 0.1, "loss": 2.1903536319732666, "step": 5732 }, { "epoch": 0.18203174603174604, "grad_norm": 0.103515625, "learning_rate": 0.1, "loss": 2.1910316944122314, "step": 5734 }, { "epoch": 0.1820952380952381, "grad_norm": 0.078125, "learning_rate": 0.1, "loss": 2.194910764694214, "step": 5736 }, { "epoch": 0.18215873015873016, "grad_norm": 0.06982421875, "learning_rate": 0.1, "loss": 2.1729824542999268, "step": 5738 }, { "epoch": 0.18222222222222223, "grad_norm": 0.1328125, "learning_rate": 0.1, "loss": 2.2001962661743164, "step": 5740 }, { "epoch": 0.18228571428571427, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.214423179626465, "step": 5742 }, { "epoch": 0.18234920634920634, "grad_norm": 0.10986328125, "learning_rate": 0.1, "loss": 2.2019081115722656, "step": 5744 }, { "epoch": 0.18241269841269842, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.194897413253784, "step": 5746 }, { "epoch": 0.1824761904761905, "grad_norm": 0.0654296875, "learning_rate": 0.1, "loss": 2.1907033920288086, "step": 5748 }, { "epoch": 0.18253968253968253, "grad_norm": 0.21484375, "learning_rate": 0.1, "loss": 2.188567876815796, "step": 5750 }, { "epoch": 0.1826031746031746, "grad_norm": 0.248046875, "learning_rate": 0.1, "loss": 2.206991672515869, "step": 5752 }, { "epoch": 0.18266666666666667, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.2200136184692383, "step": 5754 }, { "epoch": 0.18273015873015874, "grad_norm": 0.07568359375, "learning_rate": 0.1, "loss": 2.189734935760498, "step": 5756 }, { "epoch": 0.18279365079365079, "grad_norm": 0.09814453125, "learning_rate": 0.1, "loss": 2.197354793548584, "step": 5758 }, { "epoch": 0.18285714285714286, "grad_norm": 0.259765625, "learning_rate": 0.1, "loss": 2.2094507217407227, "step": 5760 }, { "epoch": 0.18292063492063493, "grad_norm": 0.328125, "learning_rate": 0.1, "loss": 2.2090041637420654, "step": 5762 }, { "epoch": 0.18298412698412697, "grad_norm": 0.04052734375, "learning_rate": 0.1, "loss": 2.2302446365356445, "step": 5764 }, { "epoch": 0.18304761904761904, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.216928482055664, "step": 5766 }, { "epoch": 0.1831111111111111, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.1856963634490967, "step": 5768 }, { "epoch": 0.18317460317460318, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.2101776599884033, "step": 5770 }, { "epoch": 0.18323809523809523, "grad_norm": 0.4140625, "learning_rate": 0.1, "loss": 2.179563045501709, "step": 5772 }, { "epoch": 0.1833015873015873, "grad_norm": 0.375, "learning_rate": 0.1, "loss": 2.19616436958313, "step": 5774 }, { "epoch": 0.18336507936507937, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.2268223762512207, "step": 5776 }, { "epoch": 0.18342857142857144, "grad_norm": 0.08984375, "learning_rate": 0.1, "loss": 2.1700973510742188, "step": 5778 }, { "epoch": 0.18349206349206348, "grad_norm": 0.08642578125, "learning_rate": 0.1, "loss": 2.2083561420440674, "step": 5780 }, { "epoch": 0.18355555555555556, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.21079683303833, "step": 5782 }, { "epoch": 0.18361904761904763, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.1987245082855225, "step": 5784 }, { "epoch": 0.18368253968253967, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.193941831588745, "step": 5786 }, { "epoch": 0.18374603174603174, "grad_norm": 0.103515625, "learning_rate": 0.1, "loss": 2.22224497795105, "step": 5788 }, { "epoch": 0.1838095238095238, "grad_norm": 0.07861328125, "learning_rate": 0.1, "loss": 2.2203209400177, "step": 5790 }, { "epoch": 0.18387301587301588, "grad_norm": 0.0654296875, "learning_rate": 0.1, "loss": 2.1991546154022217, "step": 5792 }, { "epoch": 0.18393650793650793, "grad_norm": 0.2373046875, "learning_rate": 0.1, "loss": 2.186033010482788, "step": 5794 }, { "epoch": 0.184, "grad_norm": 0.07763671875, "learning_rate": 0.1, "loss": 2.20918869972229, "step": 5796 }, { "epoch": 0.18406349206349207, "grad_norm": 0.25, "learning_rate": 0.1, "loss": 2.2011537551879883, "step": 5798 }, { "epoch": 0.18412698412698414, "grad_norm": 0.33203125, "learning_rate": 0.1, "loss": 2.1989431381225586, "step": 5800 }, { "epoch": 0.18419047619047618, "grad_norm": 0.076171875, "learning_rate": 0.1, "loss": 2.2051258087158203, "step": 5802 }, { "epoch": 0.18425396825396825, "grad_norm": 0.08544921875, "learning_rate": 0.1, "loss": 2.1982829570770264, "step": 5804 }, { "epoch": 0.18431746031746032, "grad_norm": 0.076171875, "learning_rate": 0.1, "loss": 2.205430269241333, "step": 5806 }, { "epoch": 0.18438095238095237, "grad_norm": 0.1171875, "learning_rate": 0.1, "loss": 2.1905219554901123, "step": 5808 }, { "epoch": 0.18444444444444444, "grad_norm": 0.1376953125, "learning_rate": 0.1, "loss": 2.1872129440307617, "step": 5810 }, { "epoch": 0.1845079365079365, "grad_norm": 0.2060546875, "learning_rate": 0.1, "loss": 2.2040278911590576, "step": 5812 }, { "epoch": 0.18457142857142858, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.201448917388916, "step": 5814 }, { "epoch": 0.18463492063492062, "grad_norm": 0.0751953125, "learning_rate": 0.1, "loss": 2.2140965461730957, "step": 5816 }, { "epoch": 0.1846984126984127, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.2147927284240723, "step": 5818 }, { "epoch": 0.18476190476190477, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.1852879524230957, "step": 5820 }, { "epoch": 0.18482539682539684, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.2019779682159424, "step": 5822 }, { "epoch": 0.18488888888888888, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.2206811904907227, "step": 5824 }, { "epoch": 0.18495238095238095, "grad_norm": 0.2060546875, "learning_rate": 0.1, "loss": 2.208799362182617, "step": 5826 }, { "epoch": 0.18501587301587302, "grad_norm": 0.2314453125, "learning_rate": 0.1, "loss": 2.2051916122436523, "step": 5828 }, { "epoch": 0.18507936507936507, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.2022573947906494, "step": 5830 }, { "epoch": 0.18514285714285714, "grad_norm": 0.05908203125, "learning_rate": 0.1, "loss": 2.229438304901123, "step": 5832 }, { "epoch": 0.1852063492063492, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.183858633041382, "step": 5834 }, { "epoch": 0.18526984126984128, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.203674077987671, "step": 5836 }, { "epoch": 0.18533333333333332, "grad_norm": 0.054931640625, "learning_rate": 0.1, "loss": 2.2084474563598633, "step": 5838 }, { "epoch": 0.1853968253968254, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.2422432899475098, "step": 5840 }, { "epoch": 0.18546031746031746, "grad_norm": 0.259765625, "learning_rate": 0.1, "loss": 2.220038890838623, "step": 5842 }, { "epoch": 0.18552380952380954, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.211641788482666, "step": 5844 }, { "epoch": 0.18558730158730158, "grad_norm": 0.1123046875, "learning_rate": 0.1, "loss": 2.2078566551208496, "step": 5846 }, { "epoch": 0.18565079365079365, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.2220401763916016, "step": 5848 }, { "epoch": 0.18571428571428572, "grad_norm": 0.08203125, "learning_rate": 0.1, "loss": 2.2400717735290527, "step": 5850 }, { "epoch": 0.18577777777777776, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 2.221632480621338, "step": 5852 }, { "epoch": 0.18584126984126983, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.223074436187744, "step": 5854 }, { "epoch": 0.1859047619047619, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.1982650756835938, "step": 5856 }, { "epoch": 0.18596825396825398, "grad_norm": 0.337890625, "learning_rate": 0.1, "loss": 2.201390266418457, "step": 5858 }, { "epoch": 0.18603174603174602, "grad_norm": 0.490234375, "learning_rate": 0.1, "loss": 2.1870415210723877, "step": 5860 }, { "epoch": 0.1860952380952381, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.203367233276367, "step": 5862 }, { "epoch": 0.18615873015873016, "grad_norm": 0.05859375, "learning_rate": 0.1, "loss": 2.196864604949951, "step": 5864 }, { "epoch": 0.18622222222222223, "grad_norm": 0.064453125, "learning_rate": 0.1, "loss": 2.2113149166107178, "step": 5866 }, { "epoch": 0.18628571428571428, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.213573932647705, "step": 5868 }, { "epoch": 0.18634920634920635, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.213055372238159, "step": 5870 }, { "epoch": 0.18641269841269842, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.1899914741516113, "step": 5872 }, { "epoch": 0.1864761904761905, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.2117714881896973, "step": 5874 }, { "epoch": 0.18653968253968253, "grad_norm": 0.0712890625, "learning_rate": 0.1, "loss": 2.1852095127105713, "step": 5876 }, { "epoch": 0.1866031746031746, "grad_norm": 0.10546875, "learning_rate": 0.1, "loss": 2.210883617401123, "step": 5878 }, { "epoch": 0.18666666666666668, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.1825766563415527, "step": 5880 }, { "epoch": 0.18673015873015872, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.2093746662139893, "step": 5882 }, { "epoch": 0.1867936507936508, "grad_norm": 0.12353515625, "learning_rate": 0.1, "loss": 2.1918880939483643, "step": 5884 }, { "epoch": 0.18685714285714286, "grad_norm": 0.054931640625, "learning_rate": 0.1, "loss": 2.213242530822754, "step": 5886 }, { "epoch": 0.18692063492063493, "grad_norm": 0.095703125, "learning_rate": 0.1, "loss": 2.216320753097534, "step": 5888 }, { "epoch": 0.18698412698412697, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 2.1979331970214844, "step": 5890 }, { "epoch": 0.18704761904761905, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.2045342922210693, "step": 5892 }, { "epoch": 0.18711111111111112, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.184213638305664, "step": 5894 }, { "epoch": 0.1871746031746032, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.185830593109131, "step": 5896 }, { "epoch": 0.18723809523809523, "grad_norm": 0.234375, "learning_rate": 0.1, "loss": 2.189467668533325, "step": 5898 }, { "epoch": 0.1873015873015873, "grad_norm": 0.42578125, "learning_rate": 0.1, "loss": 2.2115163803100586, "step": 5900 }, { "epoch": 0.18736507936507937, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.2494029998779297, "step": 5902 }, { "epoch": 0.18742857142857142, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.1923651695251465, "step": 5904 }, { "epoch": 0.1874920634920635, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.173976182937622, "step": 5906 }, { "epoch": 0.18755555555555556, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.1540393829345703, "step": 5908 }, { "epoch": 0.18761904761904763, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.186767816543579, "step": 5910 }, { "epoch": 0.18768253968253967, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.190272569656372, "step": 5912 }, { "epoch": 0.18774603174603174, "grad_norm": 0.060302734375, "learning_rate": 0.1, "loss": 2.2462987899780273, "step": 5914 }, { "epoch": 0.18780952380952382, "grad_norm": 0.051025390625, "learning_rate": 0.1, "loss": 2.196441888809204, "step": 5916 }, { "epoch": 0.1878730158730159, "grad_norm": 0.08642578125, "learning_rate": 0.1, "loss": 2.1785659790039062, "step": 5918 }, { "epoch": 0.18793650793650793, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.1975674629211426, "step": 5920 }, { "epoch": 0.188, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.1792964935302734, "step": 5922 }, { "epoch": 0.18806349206349207, "grad_norm": 0.2333984375, "learning_rate": 0.1, "loss": 2.1812171936035156, "step": 5924 }, { "epoch": 0.18812698412698411, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.1953885555267334, "step": 5926 }, { "epoch": 0.18819047619047619, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.1588587760925293, "step": 5928 }, { "epoch": 0.18825396825396826, "grad_norm": 0.314453125, "learning_rate": 0.1, "loss": 2.1766257286071777, "step": 5930 }, { "epoch": 0.18831746031746033, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.1973955631256104, "step": 5932 }, { "epoch": 0.18838095238095237, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.1848061084747314, "step": 5934 }, { "epoch": 0.18844444444444444, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.161389112472534, "step": 5936 }, { "epoch": 0.1885079365079365, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.1671247482299805, "step": 5938 }, { "epoch": 0.18857142857142858, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.1756482124328613, "step": 5940 }, { "epoch": 0.18863492063492063, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.2175707817077637, "step": 5942 }, { "epoch": 0.1886984126984127, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 2.2014639377593994, "step": 5944 }, { "epoch": 0.18876190476190477, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.195308208465576, "step": 5946 }, { "epoch": 0.1888253968253968, "grad_norm": 0.0888671875, "learning_rate": 0.1, "loss": 2.2031569480895996, "step": 5948 }, { "epoch": 0.18888888888888888, "grad_norm": 0.07373046875, "learning_rate": 0.1, "loss": 2.169421911239624, "step": 5950 }, { "epoch": 0.18895238095238095, "grad_norm": 0.23046875, "learning_rate": 0.1, "loss": 2.1925461292266846, "step": 5952 }, { "epoch": 0.18901587301587303, "grad_norm": 0.296875, "learning_rate": 0.1, "loss": 2.1873817443847656, "step": 5954 }, { "epoch": 0.18907936507936507, "grad_norm": 0.095703125, "learning_rate": 0.1, "loss": 2.1858420372009277, "step": 5956 }, { "epoch": 0.18914285714285714, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.1795644760131836, "step": 5958 }, { "epoch": 0.1892063492063492, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.1950314044952393, "step": 5960 }, { "epoch": 0.18926984126984128, "grad_norm": 0.048095703125, "learning_rate": 0.1, "loss": 2.1723906993865967, "step": 5962 }, { "epoch": 0.18933333333333333, "grad_norm": 0.10986328125, "learning_rate": 0.1, "loss": 2.179588794708252, "step": 5964 }, { "epoch": 0.1893968253968254, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.196774482727051, "step": 5966 }, { "epoch": 0.18946031746031747, "grad_norm": 0.365234375, "learning_rate": 0.1, "loss": 2.1812829971313477, "step": 5968 }, { "epoch": 0.1895238095238095, "grad_norm": 0.396484375, "learning_rate": 0.1, "loss": 2.1847035884857178, "step": 5970 }, { "epoch": 0.18958730158730158, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.1821630001068115, "step": 5972 }, { "epoch": 0.18965079365079365, "grad_norm": 0.064453125, "learning_rate": 0.1, "loss": 2.1656107902526855, "step": 5974 }, { "epoch": 0.18971428571428572, "grad_norm": 0.06591796875, "learning_rate": 0.1, "loss": 2.1990864276885986, "step": 5976 }, { "epoch": 0.18977777777777777, "grad_norm": 0.2373046875, "learning_rate": 0.1, "loss": 2.1667439937591553, "step": 5978 }, { "epoch": 0.18984126984126984, "grad_norm": 0.1376953125, "learning_rate": 0.1, "loss": 2.1831586360931396, "step": 5980 }, { "epoch": 0.1899047619047619, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.1964125633239746, "step": 5982 }, { "epoch": 0.18996825396825398, "grad_norm": 0.054443359375, "learning_rate": 0.1, "loss": 2.1974003314971924, "step": 5984 }, { "epoch": 0.19003174603174602, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.201986312866211, "step": 5986 }, { "epoch": 0.1900952380952381, "grad_norm": 0.1953125, "learning_rate": 0.1, "loss": 2.1796505451202393, "step": 5988 }, { "epoch": 0.19015873015873017, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.191772699356079, "step": 5990 }, { "epoch": 0.1902222222222222, "grad_norm": 0.06396484375, "learning_rate": 0.1, "loss": 2.1828956604003906, "step": 5992 }, { "epoch": 0.19028571428571428, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.180651903152466, "step": 5994 }, { "epoch": 0.19034920634920635, "grad_norm": 0.24609375, "learning_rate": 0.1, "loss": 2.149620532989502, "step": 5996 }, { "epoch": 0.19041269841269842, "grad_norm": 0.341796875, "learning_rate": 0.1, "loss": 2.1642372608184814, "step": 5998 }, { "epoch": 0.19047619047619047, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.1501424312591553, "step": 6000 }, { "epoch": 0.19053968253968254, "grad_norm": 0.1328125, "learning_rate": 0.1, "loss": 2.1779870986938477, "step": 6002 }, { "epoch": 0.1906031746031746, "grad_norm": 0.11181640625, "learning_rate": 0.1, "loss": 2.155672788619995, "step": 6004 }, { "epoch": 0.19066666666666668, "grad_norm": 0.10888671875, "learning_rate": 0.1, "loss": 2.174104928970337, "step": 6006 }, { "epoch": 0.19073015873015872, "grad_norm": 0.07080078125, "learning_rate": 0.1, "loss": 2.158841609954834, "step": 6008 }, { "epoch": 0.1907936507936508, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.1967852115631104, "step": 6010 }, { "epoch": 0.19085714285714286, "grad_norm": 0.375, "learning_rate": 0.1, "loss": 2.1663150787353516, "step": 6012 }, { "epoch": 0.1909206349206349, "grad_norm": 0.2177734375, "learning_rate": 0.1, "loss": 2.173383951187134, "step": 6014 }, { "epoch": 0.19098412698412698, "grad_norm": 0.28125, "learning_rate": 0.1, "loss": 2.175143241882324, "step": 6016 }, { "epoch": 0.19104761904761905, "grad_norm": 0.115234375, "learning_rate": 0.1, "loss": 2.213446617126465, "step": 6018 }, { "epoch": 0.19111111111111112, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.1765313148498535, "step": 6020 }, { "epoch": 0.19117460317460316, "grad_norm": 0.1103515625, "learning_rate": 0.1, "loss": 2.2055962085723877, "step": 6022 }, { "epoch": 0.19123809523809523, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.1716954708099365, "step": 6024 }, { "epoch": 0.1913015873015873, "grad_norm": 0.061767578125, "learning_rate": 0.1, "loss": 2.179086923599243, "step": 6026 }, { "epoch": 0.19136507936507938, "grad_norm": 0.208984375, "learning_rate": 0.1, "loss": 2.2210729122161865, "step": 6028 }, { "epoch": 0.19142857142857142, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.1864676475524902, "step": 6030 }, { "epoch": 0.1914920634920635, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.1944093704223633, "step": 6032 }, { "epoch": 0.19155555555555556, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.1865828037261963, "step": 6034 }, { "epoch": 0.1916190476190476, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.195376396179199, "step": 6036 }, { "epoch": 0.19168253968253968, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.188965082168579, "step": 6038 }, { "epoch": 0.19174603174603175, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.163719415664673, "step": 6040 }, { "epoch": 0.19180952380952382, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.198596954345703, "step": 6042 }, { "epoch": 0.19187301587301586, "grad_norm": 0.33984375, "learning_rate": 0.1, "loss": 2.194344997406006, "step": 6044 }, { "epoch": 0.19193650793650793, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.1846604347229004, "step": 6046 }, { "epoch": 0.192, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.194336414337158, "step": 6048 }, { "epoch": 0.19206349206349208, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.1704046726226807, "step": 6050 }, { "epoch": 0.19212698412698412, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.190962553024292, "step": 6052 }, { "epoch": 0.1921904761904762, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.2271621227264404, "step": 6054 }, { "epoch": 0.19225396825396826, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.1558127403259277, "step": 6056 }, { "epoch": 0.1923174603174603, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.1877310276031494, "step": 6058 }, { "epoch": 0.19238095238095237, "grad_norm": 0.0732421875, "learning_rate": 0.1, "loss": 2.215204954147339, "step": 6060 }, { "epoch": 0.19244444444444445, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.192199468612671, "step": 6062 }, { "epoch": 0.19250793650793652, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.198889970779419, "step": 6064 }, { "epoch": 0.19257142857142856, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.220089912414551, "step": 6066 }, { "epoch": 0.19263492063492063, "grad_norm": 0.04443359375, "learning_rate": 0.1, "loss": 2.1915431022644043, "step": 6068 }, { "epoch": 0.1926984126984127, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.176743268966675, "step": 6070 }, { "epoch": 0.19276190476190477, "grad_norm": 0.453125, "learning_rate": 0.1, "loss": 2.192582845687866, "step": 6072 }, { "epoch": 0.19282539682539682, "grad_norm": 0.0810546875, "learning_rate": 0.1, "loss": 2.1824567317962646, "step": 6074 }, { "epoch": 0.1928888888888889, "grad_norm": 0.11083984375, "learning_rate": 0.1, "loss": 2.1832454204559326, "step": 6076 }, { "epoch": 0.19295238095238096, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.1691694259643555, "step": 6078 }, { "epoch": 0.19301587301587303, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.1502773761749268, "step": 6080 }, { "epoch": 0.19307936507936507, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.1799919605255127, "step": 6082 }, { "epoch": 0.19314285714285714, "grad_norm": 0.458984375, "learning_rate": 0.1, "loss": 2.1952779293060303, "step": 6084 }, { "epoch": 0.19320634920634921, "grad_norm": 0.2060546875, "learning_rate": 0.1, "loss": 2.158310651779175, "step": 6086 }, { "epoch": 0.19326984126984126, "grad_norm": 0.1015625, "learning_rate": 0.1, "loss": 2.197373867034912, "step": 6088 }, { "epoch": 0.19333333333333333, "grad_norm": 0.0634765625, "learning_rate": 0.1, "loss": 2.1505024433135986, "step": 6090 }, { "epoch": 0.1933968253968254, "grad_norm": 0.041015625, "learning_rate": 0.1, "loss": 2.1889684200286865, "step": 6092 }, { "epoch": 0.19346031746031747, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.170407295227051, "step": 6094 }, { "epoch": 0.19352380952380951, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.1834094524383545, "step": 6096 }, { "epoch": 0.19358730158730159, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.205393075942993, "step": 6098 }, { "epoch": 0.19365079365079366, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.196199893951416, "step": 6100 }, { "epoch": 0.19371428571428573, "grad_norm": 0.1142578125, "learning_rate": 0.1, "loss": 2.196040153503418, "step": 6102 }, { "epoch": 0.19377777777777777, "grad_norm": 0.10693359375, "learning_rate": 0.1, "loss": 2.1663010120391846, "step": 6104 }, { "epoch": 0.19384126984126984, "grad_norm": 0.08203125, "learning_rate": 0.1, "loss": 2.1630356311798096, "step": 6106 }, { "epoch": 0.1939047619047619, "grad_norm": 0.220703125, "learning_rate": 0.1, "loss": 2.1755616664886475, "step": 6108 }, { "epoch": 0.19396825396825396, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.184537887573242, "step": 6110 }, { "epoch": 0.19403174603174603, "grad_norm": 0.1875, "learning_rate": 0.1, "loss": 2.182258129119873, "step": 6112 }, { "epoch": 0.1940952380952381, "grad_norm": 0.04833984375, "learning_rate": 0.1, "loss": 2.169524669647217, "step": 6114 }, { "epoch": 0.19415873015873017, "grad_norm": 0.11181640625, "learning_rate": 0.1, "loss": 2.212264060974121, "step": 6116 }, { "epoch": 0.1942222222222222, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.165604591369629, "step": 6118 }, { "epoch": 0.19428571428571428, "grad_norm": 0.349609375, "learning_rate": 0.1, "loss": 2.1764636039733887, "step": 6120 }, { "epoch": 0.19434920634920635, "grad_norm": 0.2392578125, "learning_rate": 0.1, "loss": 2.1724045276641846, "step": 6122 }, { "epoch": 0.19441269841269843, "grad_norm": 0.053466796875, "learning_rate": 0.1, "loss": 2.143773317337036, "step": 6124 }, { "epoch": 0.19447619047619047, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.1675078868865967, "step": 6126 }, { "epoch": 0.19453968253968254, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.180356502532959, "step": 6128 }, { "epoch": 0.1946031746031746, "grad_norm": 0.220703125, "learning_rate": 0.1, "loss": 2.1821236610412598, "step": 6130 }, { "epoch": 0.19466666666666665, "grad_norm": 0.236328125, "learning_rate": 0.1, "loss": 2.1550347805023193, "step": 6132 }, { "epoch": 0.19473015873015873, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.184617757797241, "step": 6134 }, { "epoch": 0.1947936507936508, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 2.2026443481445312, "step": 6136 }, { "epoch": 0.19485714285714287, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.2038261890411377, "step": 6138 }, { "epoch": 0.1949206349206349, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.1637344360351562, "step": 6140 }, { "epoch": 0.19498412698412698, "grad_norm": 0.07275390625, "learning_rate": 0.1, "loss": 2.184546947479248, "step": 6142 }, { "epoch": 0.19504761904761905, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.2061712741851807, "step": 6144 }, { "epoch": 0.19511111111111112, "grad_norm": 0.1416015625, "learning_rate": 0.1, "loss": 2.1704251766204834, "step": 6146 }, { "epoch": 0.19517460317460317, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.1813440322875977, "step": 6148 }, { "epoch": 0.19523809523809524, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.1940746307373047, "step": 6150 }, { "epoch": 0.1953015873015873, "grad_norm": 0.24609375, "learning_rate": 0.1, "loss": 2.1678903102874756, "step": 6152 }, { "epoch": 0.19536507936507935, "grad_norm": 0.439453125, "learning_rate": 0.1, "loss": 2.2087082862854004, "step": 6154 }, { "epoch": 0.19542857142857142, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.1911754608154297, "step": 6156 }, { "epoch": 0.1954920634920635, "grad_norm": 0.07373046875, "learning_rate": 0.1, "loss": 2.1483917236328125, "step": 6158 }, { "epoch": 0.19555555555555557, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.189765214920044, "step": 6160 }, { "epoch": 0.1956190476190476, "grad_norm": 0.20703125, "learning_rate": 0.1, "loss": 2.1673147678375244, "step": 6162 }, { "epoch": 0.19568253968253968, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.1691842079162598, "step": 6164 }, { "epoch": 0.19574603174603175, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.177700996398926, "step": 6166 }, { "epoch": 0.19580952380952382, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.1793482303619385, "step": 6168 }, { "epoch": 0.19587301587301587, "grad_norm": 0.2255859375, "learning_rate": 0.1, "loss": 2.193760871887207, "step": 6170 }, { "epoch": 0.19593650793650794, "grad_norm": 0.203125, "learning_rate": 0.1, "loss": 2.1806981563568115, "step": 6172 }, { "epoch": 0.196, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.159386157989502, "step": 6174 }, { "epoch": 0.19606349206349205, "grad_norm": 0.2421875, "learning_rate": 0.1, "loss": 2.1702191829681396, "step": 6176 }, { "epoch": 0.19612698412698412, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.1907780170440674, "step": 6178 }, { "epoch": 0.1961904761904762, "grad_norm": 0.055419921875, "learning_rate": 0.1, "loss": 2.1590397357940674, "step": 6180 }, { "epoch": 0.19625396825396826, "grad_norm": 0.06396484375, "learning_rate": 0.1, "loss": 2.1713504791259766, "step": 6182 }, { "epoch": 0.1963174603174603, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.1719706058502197, "step": 6184 }, { "epoch": 0.19638095238095238, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.1600863933563232, "step": 6186 }, { "epoch": 0.19644444444444445, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.158698558807373, "step": 6188 }, { "epoch": 0.19650793650793652, "grad_norm": 0.3203125, "learning_rate": 0.1, "loss": 2.159611463546753, "step": 6190 }, { "epoch": 0.19657142857142856, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.174875497817993, "step": 6192 }, { "epoch": 0.19663492063492063, "grad_norm": 0.33203125, "learning_rate": 0.1, "loss": 2.158146619796753, "step": 6194 }, { "epoch": 0.1966984126984127, "grad_norm": 0.10888671875, "learning_rate": 0.1, "loss": 2.192859411239624, "step": 6196 }, { "epoch": 0.19676190476190475, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.1683666706085205, "step": 6198 }, { "epoch": 0.19682539682539682, "grad_norm": 0.057373046875, "learning_rate": 0.1, "loss": 2.152535915374756, "step": 6200 }, { "epoch": 0.1968888888888889, "grad_norm": 0.0849609375, "learning_rate": 0.1, "loss": 2.1684515476226807, "step": 6202 }, { "epoch": 0.19695238095238096, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.1583399772644043, "step": 6204 }, { "epoch": 0.197015873015873, "grad_norm": 0.2294921875, "learning_rate": 0.1, "loss": 2.1438395977020264, "step": 6206 }, { "epoch": 0.19707936507936508, "grad_norm": 0.07177734375, "learning_rate": 0.1, "loss": 2.180917978286743, "step": 6208 }, { "epoch": 0.19714285714285715, "grad_norm": 0.36328125, "learning_rate": 0.1, "loss": 2.1501803398132324, "step": 6210 }, { "epoch": 0.19720634920634922, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.1660940647125244, "step": 6212 }, { "epoch": 0.19726984126984126, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.134389638900757, "step": 6214 }, { "epoch": 0.19733333333333333, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.1730146408081055, "step": 6216 }, { "epoch": 0.1973968253968254, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.157869815826416, "step": 6218 }, { "epoch": 0.19746031746031745, "grad_norm": 0.10888671875, "learning_rate": 0.1, "loss": 2.1650173664093018, "step": 6220 }, { "epoch": 0.19752380952380952, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.153907537460327, "step": 6222 }, { "epoch": 0.1975873015873016, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.155945301055908, "step": 6224 }, { "epoch": 0.19765079365079366, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.153611898422241, "step": 6226 }, { "epoch": 0.1977142857142857, "grad_norm": 0.0625, "learning_rate": 0.1, "loss": 2.146571397781372, "step": 6228 }, { "epoch": 0.19777777777777777, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.1939034461975098, "step": 6230 }, { "epoch": 0.19784126984126985, "grad_norm": 0.34765625, "learning_rate": 0.1, "loss": 2.164114236831665, "step": 6232 }, { "epoch": 0.19790476190476192, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.144237995147705, "step": 6234 }, { "epoch": 0.19796825396825396, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.126831293106079, "step": 6236 }, { "epoch": 0.19803174603174603, "grad_norm": 0.10986328125, "learning_rate": 0.1, "loss": 2.14612078666687, "step": 6238 }, { "epoch": 0.1980952380952381, "grad_norm": 0.1328125, "learning_rate": 0.1, "loss": 2.1502139568328857, "step": 6240 }, { "epoch": 0.19815873015873015, "grad_norm": 0.208984375, "learning_rate": 0.1, "loss": 2.1577794551849365, "step": 6242 }, { "epoch": 0.19822222222222222, "grad_norm": 0.431640625, "learning_rate": 0.1, "loss": 2.1726956367492676, "step": 6244 }, { "epoch": 0.1982857142857143, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.156047821044922, "step": 6246 }, { "epoch": 0.19834920634920636, "grad_norm": 0.054443359375, "learning_rate": 0.1, "loss": 2.171511650085449, "step": 6248 }, { "epoch": 0.1984126984126984, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.1431000232696533, "step": 6250 }, { "epoch": 0.19847619047619047, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.1573305130004883, "step": 6252 }, { "epoch": 0.19853968253968254, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.159522294998169, "step": 6254 }, { "epoch": 0.19860317460317461, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.132934808731079, "step": 6256 }, { "epoch": 0.19866666666666666, "grad_norm": 0.10400390625, "learning_rate": 0.1, "loss": 2.154473304748535, "step": 6258 }, { "epoch": 0.19873015873015873, "grad_norm": 0.07763671875, "learning_rate": 0.1, "loss": 2.1386988162994385, "step": 6260 }, { "epoch": 0.1987936507936508, "grad_norm": 0.08935546875, "learning_rate": 0.1, "loss": 2.1352548599243164, "step": 6262 }, { "epoch": 0.19885714285714284, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.1753270626068115, "step": 6264 }, { "epoch": 0.19892063492063491, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.1594643592834473, "step": 6266 }, { "epoch": 0.19898412698412699, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.1514132022857666, "step": 6268 }, { "epoch": 0.19904761904761906, "grad_norm": 0.357421875, "learning_rate": 0.1, "loss": 2.184725046157837, "step": 6270 }, { "epoch": 0.1991111111111111, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.1760497093200684, "step": 6272 }, { "epoch": 0.19917460317460317, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.1809940338134766, "step": 6274 }, { "epoch": 0.19923809523809524, "grad_norm": 0.06787109375, "learning_rate": 0.1, "loss": 2.1366426944732666, "step": 6276 }, { "epoch": 0.1993015873015873, "grad_norm": 0.051513671875, "learning_rate": 0.1, "loss": 2.155985116958618, "step": 6278 }, { "epoch": 0.19936507936507936, "grad_norm": 0.049560546875, "learning_rate": 0.1, "loss": 2.188028335571289, "step": 6280 }, { "epoch": 0.19942857142857143, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.1724112033843994, "step": 6282 }, { "epoch": 0.1994920634920635, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.173720121383667, "step": 6284 }, { "epoch": 0.19955555555555557, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.1264238357543945, "step": 6286 }, { "epoch": 0.1996190476190476, "grad_norm": 0.34765625, "learning_rate": 0.1, "loss": 2.199678421020508, "step": 6288 }, { "epoch": 0.19968253968253968, "grad_norm": 0.421875, "learning_rate": 0.1, "loss": 2.1632678508758545, "step": 6290 }, { "epoch": 0.19974603174603175, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.1516315937042236, "step": 6292 }, { "epoch": 0.1998095238095238, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.1713647842407227, "step": 6294 }, { "epoch": 0.19987301587301587, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.1505002975463867, "step": 6296 }, { "epoch": 0.19993650793650794, "grad_norm": 0.248046875, "learning_rate": 0.1, "loss": 2.1591060161590576, "step": 6298 }, { "epoch": 0.2, "grad_norm": 0.1328125, "learning_rate": 0.1, "loss": 2.1397767066955566, "step": 6300 }, { "epoch": 0.2, "eval_loss": 1.789380669593811, "eval_runtime": 130.8913, "eval_samples_per_second": 8.114, "eval_steps_per_second": 2.032, "step": 6300 }, { "epoch": 0.20006349206349205, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.134932518005371, "step": 6302 }, { "epoch": 0.20012698412698413, "grad_norm": 0.310546875, "learning_rate": 0.1, "loss": 2.1447412967681885, "step": 6304 }, { "epoch": 0.2001904761904762, "grad_norm": 0.248046875, "learning_rate": 0.1, "loss": 2.149040937423706, "step": 6306 }, { "epoch": 0.20025396825396827, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.1511216163635254, "step": 6308 }, { "epoch": 0.2003174603174603, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.15629506111145, "step": 6310 }, { "epoch": 0.20038095238095238, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.176990032196045, "step": 6312 }, { "epoch": 0.20044444444444445, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.1630804538726807, "step": 6314 }, { "epoch": 0.2005079365079365, "grad_norm": 0.1875, "learning_rate": 0.1, "loss": 2.1452455520629883, "step": 6316 }, { "epoch": 0.20057142857142857, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.173180341720581, "step": 6318 }, { "epoch": 0.20063492063492064, "grad_norm": 0.232421875, "learning_rate": 0.1, "loss": 2.144226312637329, "step": 6320 }, { "epoch": 0.2006984126984127, "grad_norm": 0.068359375, "learning_rate": 0.1, "loss": 2.147150993347168, "step": 6322 }, { "epoch": 0.20076190476190475, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.1569759845733643, "step": 6324 }, { "epoch": 0.20082539682539682, "grad_norm": 0.11279296875, "learning_rate": 0.1, "loss": 2.143246650695801, "step": 6326 }, { "epoch": 0.2008888888888889, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.1478564739227295, "step": 6328 }, { "epoch": 0.20095238095238097, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.1465587615966797, "step": 6330 }, { "epoch": 0.201015873015873, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.1456310749053955, "step": 6332 }, { "epoch": 0.20107936507936508, "grad_norm": 0.06201171875, "learning_rate": 0.1, "loss": 2.200171947479248, "step": 6334 }, { "epoch": 0.20114285714285715, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.1393373012542725, "step": 6336 }, { "epoch": 0.2012063492063492, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.1357507705688477, "step": 6338 }, { "epoch": 0.20126984126984127, "grad_norm": 0.3359375, "learning_rate": 0.1, "loss": 2.1747095584869385, "step": 6340 }, { "epoch": 0.20133333333333334, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.1276538372039795, "step": 6342 }, { "epoch": 0.2013968253968254, "grad_norm": 0.1162109375, "learning_rate": 0.1, "loss": 2.1489195823669434, "step": 6344 }, { "epoch": 0.20146031746031745, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.1829347610473633, "step": 6346 }, { "epoch": 0.20152380952380952, "grad_norm": 0.30859375, "learning_rate": 0.1, "loss": 2.149980306625366, "step": 6348 }, { "epoch": 0.2015873015873016, "grad_norm": 0.0810546875, "learning_rate": 0.1, "loss": 2.1677029132843018, "step": 6350 }, { "epoch": 0.20165079365079366, "grad_norm": 0.056884765625, "learning_rate": 0.1, "loss": 2.199399471282959, "step": 6352 }, { "epoch": 0.2017142857142857, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.1760952472686768, "step": 6354 }, { "epoch": 0.20177777777777778, "grad_norm": 0.095703125, "learning_rate": 0.1, "loss": 2.150946855545044, "step": 6356 }, { "epoch": 0.20184126984126985, "grad_norm": 0.06787109375, "learning_rate": 0.1, "loss": 2.143873453140259, "step": 6358 }, { "epoch": 0.2019047619047619, "grad_norm": 0.09521484375, "learning_rate": 0.1, "loss": 2.1785662174224854, "step": 6360 }, { "epoch": 0.20196825396825396, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.160181999206543, "step": 6362 }, { "epoch": 0.20203174603174603, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.1712963581085205, "step": 6364 }, { "epoch": 0.2020952380952381, "grad_norm": 0.10888671875, "learning_rate": 0.1, "loss": 2.1563289165496826, "step": 6366 }, { "epoch": 0.20215873015873015, "grad_norm": 0.28125, "learning_rate": 0.1, "loss": 2.1659274101257324, "step": 6368 }, { "epoch": 0.20222222222222222, "grad_norm": 0.1416015625, "learning_rate": 0.1, "loss": 2.1915102005004883, "step": 6370 }, { "epoch": 0.2022857142857143, "grad_norm": 0.03466796875, "learning_rate": 0.1, "loss": 2.17429780960083, "step": 6372 }, { "epoch": 0.20234920634920636, "grad_norm": 0.068359375, "learning_rate": 0.1, "loss": 2.1632392406463623, "step": 6374 }, { "epoch": 0.2024126984126984, "grad_norm": 0.115234375, "learning_rate": 0.1, "loss": 2.1871325969696045, "step": 6376 }, { "epoch": 0.20247619047619048, "grad_norm": 0.08544921875, "learning_rate": 0.1, "loss": 2.1641998291015625, "step": 6378 }, { "epoch": 0.20253968253968255, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.121339797973633, "step": 6380 }, { "epoch": 0.2026031746031746, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.1830854415893555, "step": 6382 }, { "epoch": 0.20266666666666666, "grad_norm": 0.447265625, "learning_rate": 0.1, "loss": 2.188486099243164, "step": 6384 }, { "epoch": 0.20273015873015873, "grad_norm": 0.0927734375, "learning_rate": 0.1, "loss": 2.1872661113739014, "step": 6386 }, { "epoch": 0.2027936507936508, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.1438026428222656, "step": 6388 }, { "epoch": 0.20285714285714285, "grad_norm": 0.1201171875, "learning_rate": 0.1, "loss": 2.1825153827667236, "step": 6390 }, { "epoch": 0.20292063492063492, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.1712558269500732, "step": 6392 }, { "epoch": 0.202984126984127, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.1665148735046387, "step": 6394 }, { "epoch": 0.20304761904761906, "grad_norm": 0.34375, "learning_rate": 0.1, "loss": 2.1644949913024902, "step": 6396 }, { "epoch": 0.2031111111111111, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.1723225116729736, "step": 6398 }, { "epoch": 0.20317460317460317, "grad_norm": 0.057861328125, "learning_rate": 0.1, "loss": 2.200850009918213, "step": 6400 }, { "epoch": 0.20323809523809525, "grad_norm": 0.060791015625, "learning_rate": 0.1, "loss": 2.1580796241760254, "step": 6402 }, { "epoch": 0.2033015873015873, "grad_norm": 0.080078125, "learning_rate": 0.1, "loss": 2.1425533294677734, "step": 6404 }, { "epoch": 0.20336507936507936, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.140320301055908, "step": 6406 }, { "epoch": 0.20342857142857143, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.1683692932128906, "step": 6408 }, { "epoch": 0.2034920634920635, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.1687135696411133, "step": 6410 }, { "epoch": 0.20355555555555555, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.149510383605957, "step": 6412 }, { "epoch": 0.20361904761904762, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.148559093475342, "step": 6414 }, { "epoch": 0.2036825396825397, "grad_norm": 0.32421875, "learning_rate": 0.1, "loss": 2.182732582092285, "step": 6416 }, { "epoch": 0.20374603174603176, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.1986923217773438, "step": 6418 }, { "epoch": 0.2038095238095238, "grad_norm": 0.06298828125, "learning_rate": 0.1, "loss": 2.169917583465576, "step": 6420 }, { "epoch": 0.20387301587301587, "grad_norm": 0.060546875, "learning_rate": 0.1, "loss": 2.1918654441833496, "step": 6422 }, { "epoch": 0.20393650793650794, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.173410654067993, "step": 6424 }, { "epoch": 0.204, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.150686025619507, "step": 6426 }, { "epoch": 0.20406349206349206, "grad_norm": 0.06494140625, "learning_rate": 0.1, "loss": 2.189450263977051, "step": 6428 }, { "epoch": 0.20412698412698413, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.1734201908111572, "step": 6430 }, { "epoch": 0.2041904761904762, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.1359946727752686, "step": 6432 }, { "epoch": 0.20425396825396824, "grad_norm": 0.115234375, "learning_rate": 0.1, "loss": 2.1671946048736572, "step": 6434 }, { "epoch": 0.20431746031746031, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.1869983673095703, "step": 6436 }, { "epoch": 0.20438095238095239, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.186223030090332, "step": 6438 }, { "epoch": 0.20444444444444446, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.1797330379486084, "step": 6440 }, { "epoch": 0.2045079365079365, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.1618194580078125, "step": 6442 }, { "epoch": 0.20457142857142857, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.171804666519165, "step": 6444 }, { "epoch": 0.20463492063492064, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.1951966285705566, "step": 6446 }, { "epoch": 0.20469841269841269, "grad_norm": 0.10693359375, "learning_rate": 0.1, "loss": 2.2053847312927246, "step": 6448 }, { "epoch": 0.20476190476190476, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.2122716903686523, "step": 6450 }, { "epoch": 0.20482539682539683, "grad_norm": 0.3671875, "learning_rate": 0.1, "loss": 2.1894750595092773, "step": 6452 }, { "epoch": 0.2048888888888889, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.2260262966156006, "step": 6454 }, { "epoch": 0.20495238095238094, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.2473971843719482, "step": 6456 }, { "epoch": 0.205015873015873, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.157022714614868, "step": 6458 }, { "epoch": 0.20507936507936508, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.173673391342163, "step": 6460 }, { "epoch": 0.20514285714285715, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.1849818229675293, "step": 6462 }, { "epoch": 0.2052063492063492, "grad_norm": 0.2255859375, "learning_rate": 0.1, "loss": 2.2054717540740967, "step": 6464 }, { "epoch": 0.20526984126984127, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.195793390274048, "step": 6466 }, { "epoch": 0.20533333333333334, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.2241098880767822, "step": 6468 }, { "epoch": 0.20539682539682538, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.2238736152648926, "step": 6470 }, { "epoch": 0.20546031746031745, "grad_norm": 0.115234375, "learning_rate": 0.1, "loss": 2.1919901371002197, "step": 6472 }, { "epoch": 0.20552380952380953, "grad_norm": 0.0703125, "learning_rate": 0.1, "loss": 2.1945674419403076, "step": 6474 }, { "epoch": 0.2055873015873016, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.2041218280792236, "step": 6476 }, { "epoch": 0.20565079365079364, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.1953349113464355, "step": 6478 }, { "epoch": 0.2057142857142857, "grad_norm": 0.09619140625, "learning_rate": 0.1, "loss": 2.2183542251586914, "step": 6480 }, { "epoch": 0.20577777777777778, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.1788268089294434, "step": 6482 }, { "epoch": 0.20584126984126985, "grad_norm": 0.4140625, "learning_rate": 0.1, "loss": 2.183762788772583, "step": 6484 }, { "epoch": 0.2059047619047619, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.1801440715789795, "step": 6486 }, { "epoch": 0.20596825396825397, "grad_norm": 0.0810546875, "learning_rate": 0.1, "loss": 2.166905164718628, "step": 6488 }, { "epoch": 0.20603174603174604, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.186269521713257, "step": 6490 }, { "epoch": 0.2060952380952381, "grad_norm": 0.11376953125, "learning_rate": 0.1, "loss": 2.1890735626220703, "step": 6492 }, { "epoch": 0.20615873015873015, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.199967384338379, "step": 6494 }, { "epoch": 0.20622222222222222, "grad_norm": 0.2470703125, "learning_rate": 0.1, "loss": 2.1831860542297363, "step": 6496 }, { "epoch": 0.2062857142857143, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.1659393310546875, "step": 6498 }, { "epoch": 0.20634920634920634, "grad_norm": 0.06494140625, "learning_rate": 0.1, "loss": 2.1628899574279785, "step": 6500 }, { "epoch": 0.2064126984126984, "grad_norm": 0.078125, "learning_rate": 0.1, "loss": 2.198374032974243, "step": 6502 }, { "epoch": 0.20647619047619048, "grad_norm": 0.10498046875, "learning_rate": 0.1, "loss": 2.1678342819213867, "step": 6504 }, { "epoch": 0.20653968253968255, "grad_norm": 0.054443359375, "learning_rate": 0.1, "loss": 2.1714112758636475, "step": 6506 }, { "epoch": 0.2066031746031746, "grad_norm": 0.11962890625, "learning_rate": 0.1, "loss": 2.1780455112457275, "step": 6508 }, { "epoch": 0.20666666666666667, "grad_norm": 0.2216796875, "learning_rate": 0.1, "loss": 2.1829216480255127, "step": 6510 }, { "epoch": 0.20673015873015874, "grad_norm": 0.10302734375, "learning_rate": 0.1, "loss": 2.1540040969848633, "step": 6512 }, { "epoch": 0.2067936507936508, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.156402111053467, "step": 6514 }, { "epoch": 0.20685714285714285, "grad_norm": 0.322265625, "learning_rate": 0.1, "loss": 2.1930651664733887, "step": 6516 }, { "epoch": 0.20692063492063492, "grad_norm": 0.36328125, "learning_rate": 0.1, "loss": 2.1767616271972656, "step": 6518 }, { "epoch": 0.206984126984127, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.1647439002990723, "step": 6520 }, { "epoch": 0.20704761904761904, "grad_norm": 0.1201171875, "learning_rate": 0.1, "loss": 2.1798510551452637, "step": 6522 }, { "epoch": 0.2071111111111111, "grad_norm": 0.08056640625, "learning_rate": 0.1, "loss": 2.1570374965667725, "step": 6524 }, { "epoch": 0.20717460317460318, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.148193597793579, "step": 6526 }, { "epoch": 0.20723809523809525, "grad_norm": 0.28125, "learning_rate": 0.1, "loss": 2.1497747898101807, "step": 6528 }, { "epoch": 0.2073015873015873, "grad_norm": 0.318359375, "learning_rate": 0.1, "loss": 2.1393697261810303, "step": 6530 }, { "epoch": 0.20736507936507936, "grad_norm": 0.298828125, "learning_rate": 0.1, "loss": 2.176668882369995, "step": 6532 }, { "epoch": 0.20742857142857143, "grad_norm": 0.11083984375, "learning_rate": 0.1, "loss": 2.164783239364624, "step": 6534 }, { "epoch": 0.2074920634920635, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.1341679096221924, "step": 6536 }, { "epoch": 0.20755555555555555, "grad_norm": 0.09423828125, "learning_rate": 0.1, "loss": 2.1626336574554443, "step": 6538 }, { "epoch": 0.20761904761904762, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.136343002319336, "step": 6540 }, { "epoch": 0.2076825396825397, "grad_norm": 0.056396484375, "learning_rate": 0.1, "loss": 2.168395757675171, "step": 6542 }, { "epoch": 0.20774603174603173, "grad_norm": 0.07421875, "learning_rate": 0.1, "loss": 2.1447694301605225, "step": 6544 }, { "epoch": 0.2078095238095238, "grad_norm": 0.10009765625, "learning_rate": 0.1, "loss": 2.17566180229187, "step": 6546 }, { "epoch": 0.20787301587301588, "grad_norm": 0.07373046875, "learning_rate": 0.1, "loss": 2.1691384315490723, "step": 6548 }, { "epoch": 0.20793650793650795, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.1774306297302246, "step": 6550 }, { "epoch": 0.208, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.1608052253723145, "step": 6552 }, { "epoch": 0.20806349206349206, "grad_norm": 0.103515625, "learning_rate": 0.1, "loss": 2.129629135131836, "step": 6554 }, { "epoch": 0.20812698412698413, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.16937255859375, "step": 6556 }, { "epoch": 0.2081904761904762, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.1647653579711914, "step": 6558 }, { "epoch": 0.20825396825396825, "grad_norm": 0.8125, "learning_rate": 0.1, "loss": 2.1495988368988037, "step": 6560 }, { "epoch": 0.20831746031746032, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.1310501098632812, "step": 6562 }, { "epoch": 0.2083809523809524, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.140765905380249, "step": 6564 }, { "epoch": 0.20844444444444443, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.1549031734466553, "step": 6566 }, { "epoch": 0.2085079365079365, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.144796371459961, "step": 6568 }, { "epoch": 0.20857142857142857, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.1371405124664307, "step": 6570 }, { "epoch": 0.20863492063492065, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.1624667644500732, "step": 6572 }, { "epoch": 0.2086984126984127, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.16226863861084, "step": 6574 }, { "epoch": 0.20876190476190476, "grad_norm": 0.053466796875, "learning_rate": 0.1, "loss": 2.129021644592285, "step": 6576 }, { "epoch": 0.20882539682539683, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.1647744178771973, "step": 6578 }, { "epoch": 0.2088888888888889, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.1531848907470703, "step": 6580 }, { "epoch": 0.20895238095238095, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.1342742443084717, "step": 6582 }, { "epoch": 0.20901587301587302, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.136016845703125, "step": 6584 }, { "epoch": 0.2090793650793651, "grad_norm": 0.1376953125, "learning_rate": 0.1, "loss": 2.1336145401000977, "step": 6586 }, { "epoch": 0.20914285714285713, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.163861036300659, "step": 6588 }, { "epoch": 0.2092063492063492, "grad_norm": 0.05810546875, "learning_rate": 0.1, "loss": 2.1170730590820312, "step": 6590 }, { "epoch": 0.20926984126984127, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.111055374145508, "step": 6592 }, { "epoch": 0.20933333333333334, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.1412017345428467, "step": 6594 }, { "epoch": 0.2093968253968254, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.1640784740448, "step": 6596 }, { "epoch": 0.20946031746031746, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.1227312088012695, "step": 6598 }, { "epoch": 0.20952380952380953, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.1380250453948975, "step": 6600 }, { "epoch": 0.2095873015873016, "grad_norm": 0.34375, "learning_rate": 0.1, "loss": 2.1244192123413086, "step": 6602 }, { "epoch": 0.20965079365079364, "grad_norm": 0.0673828125, "learning_rate": 0.1, "loss": 2.115748167037964, "step": 6604 }, { "epoch": 0.20971428571428571, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.1327924728393555, "step": 6606 }, { "epoch": 0.20977777777777779, "grad_norm": 0.26953125, "learning_rate": 0.1, "loss": 2.0948684215545654, "step": 6608 }, { "epoch": 0.20984126984126983, "grad_norm": 0.078125, "learning_rate": 0.1, "loss": 2.137751817703247, "step": 6610 }, { "epoch": 0.2099047619047619, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.117772102355957, "step": 6612 }, { "epoch": 0.20996825396825397, "grad_norm": 0.103515625, "learning_rate": 0.1, "loss": 2.1199865341186523, "step": 6614 }, { "epoch": 0.21003174603174604, "grad_norm": 0.04638671875, "learning_rate": 0.1, "loss": 2.093437671661377, "step": 6616 }, { "epoch": 0.21009523809523808, "grad_norm": 0.0537109375, "learning_rate": 0.1, "loss": 2.145495653152466, "step": 6618 }, { "epoch": 0.21015873015873016, "grad_norm": 0.251953125, "learning_rate": 0.1, "loss": 2.125392436981201, "step": 6620 }, { "epoch": 0.21022222222222223, "grad_norm": 0.20703125, "learning_rate": 0.1, "loss": 2.1294708251953125, "step": 6622 }, { "epoch": 0.2102857142857143, "grad_norm": 0.1875, "learning_rate": 0.1, "loss": 2.132856607437134, "step": 6624 }, { "epoch": 0.21034920634920634, "grad_norm": 0.115234375, "learning_rate": 0.1, "loss": 2.1094329357147217, "step": 6626 }, { "epoch": 0.2104126984126984, "grad_norm": 0.251953125, "learning_rate": 0.1, "loss": 2.1459603309631348, "step": 6628 }, { "epoch": 0.21047619047619048, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.1103713512420654, "step": 6630 }, { "epoch": 0.21053968253968253, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.122328042984009, "step": 6632 }, { "epoch": 0.2106031746031746, "grad_norm": 0.232421875, "learning_rate": 0.1, "loss": 2.1227426528930664, "step": 6634 }, { "epoch": 0.21066666666666667, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.0971546173095703, "step": 6636 }, { "epoch": 0.21073015873015874, "grad_norm": 0.2333984375, "learning_rate": 0.1, "loss": 2.1275570392608643, "step": 6638 }, { "epoch": 0.21079365079365078, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.1402499675750732, "step": 6640 }, { "epoch": 0.21085714285714285, "grad_norm": 0.11669921875, "learning_rate": 0.1, "loss": 2.096700668334961, "step": 6642 }, { "epoch": 0.21092063492063493, "grad_norm": 0.05126953125, "learning_rate": 0.1, "loss": 2.129594326019287, "step": 6644 }, { "epoch": 0.210984126984127, "grad_norm": 0.0888671875, "learning_rate": 0.1, "loss": 2.1113107204437256, "step": 6646 }, { "epoch": 0.21104761904761904, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.0739781856536865, "step": 6648 }, { "epoch": 0.2111111111111111, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.109806776046753, "step": 6650 }, { "epoch": 0.21117460317460318, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.115232467651367, "step": 6652 }, { "epoch": 0.21123809523809522, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.1579253673553467, "step": 6654 }, { "epoch": 0.2113015873015873, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.121755599975586, "step": 6656 }, { "epoch": 0.21136507936507937, "grad_norm": 0.1181640625, "learning_rate": 0.1, "loss": 2.113680362701416, "step": 6658 }, { "epoch": 0.21142857142857144, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.0926313400268555, "step": 6660 }, { "epoch": 0.21149206349206348, "grad_norm": 0.36328125, "learning_rate": 0.1, "loss": 2.09521484375, "step": 6662 }, { "epoch": 0.21155555555555555, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.0832061767578125, "step": 6664 }, { "epoch": 0.21161904761904762, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.0606465339660645, "step": 6666 }, { "epoch": 0.2116825396825397, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.1284737586975098, "step": 6668 }, { "epoch": 0.21174603174603174, "grad_norm": 0.2177734375, "learning_rate": 0.1, "loss": 2.0906856060028076, "step": 6670 }, { "epoch": 0.2118095238095238, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.1249935626983643, "step": 6672 }, { "epoch": 0.21187301587301588, "grad_norm": 0.0986328125, "learning_rate": 0.1, "loss": 2.09568452835083, "step": 6674 }, { "epoch": 0.21193650793650792, "grad_norm": 0.08642578125, "learning_rate": 0.1, "loss": 2.0987675189971924, "step": 6676 }, { "epoch": 0.212, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.106635093688965, "step": 6678 }, { "epoch": 0.21206349206349207, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.1069960594177246, "step": 6680 }, { "epoch": 0.21212698412698414, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.113905429840088, "step": 6682 }, { "epoch": 0.21219047619047618, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.1124913692474365, "step": 6684 }, { "epoch": 0.21225396825396825, "grad_norm": 0.10595703125, "learning_rate": 0.1, "loss": 2.1210591793060303, "step": 6686 }, { "epoch": 0.21231746031746032, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.1135032176971436, "step": 6688 }, { "epoch": 0.2123809523809524, "grad_norm": 0.208984375, "learning_rate": 0.1, "loss": 2.076572895050049, "step": 6690 }, { "epoch": 0.21244444444444444, "grad_norm": 0.2421875, "learning_rate": 0.1, "loss": 2.087473154067993, "step": 6692 }, { "epoch": 0.2125079365079365, "grad_norm": 0.41015625, "learning_rate": 0.1, "loss": 2.133327007293701, "step": 6694 }, { "epoch": 0.21257142857142858, "grad_norm": 0.09326171875, "learning_rate": 0.1, "loss": 2.1153130531311035, "step": 6696 }, { "epoch": 0.21263492063492062, "grad_norm": 0.203125, "learning_rate": 0.1, "loss": 2.1076927185058594, "step": 6698 }, { "epoch": 0.2126984126984127, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.1229991912841797, "step": 6700 }, { "epoch": 0.21276190476190476, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.106590747833252, "step": 6702 }, { "epoch": 0.21282539682539683, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.1124095916748047, "step": 6704 }, { "epoch": 0.21288888888888888, "grad_norm": 0.10986328125, "learning_rate": 0.1, "loss": 2.0907838344573975, "step": 6706 }, { "epoch": 0.21295238095238095, "grad_norm": 0.234375, "learning_rate": 0.1, "loss": 2.1262567043304443, "step": 6708 }, { "epoch": 0.21301587301587302, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.0799152851104736, "step": 6710 }, { "epoch": 0.2130793650793651, "grad_norm": 0.10595703125, "learning_rate": 0.1, "loss": 2.1127655506134033, "step": 6712 }, { "epoch": 0.21314285714285713, "grad_norm": 0.04638671875, "learning_rate": 0.1, "loss": 2.1174919605255127, "step": 6714 }, { "epoch": 0.2132063492063492, "grad_norm": 0.07763671875, "learning_rate": 0.1, "loss": 2.1067285537719727, "step": 6716 }, { "epoch": 0.21326984126984128, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.136237382888794, "step": 6718 }, { "epoch": 0.21333333333333335, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.1185741424560547, "step": 6720 }, { "epoch": 0.2133968253968254, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.1116015911102295, "step": 6722 }, { "epoch": 0.21346031746031746, "grad_norm": 0.306640625, "learning_rate": 0.1, "loss": 2.1079583168029785, "step": 6724 }, { "epoch": 0.21352380952380953, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.1007747650146484, "step": 6726 }, { "epoch": 0.21358730158730158, "grad_norm": 0.0634765625, "learning_rate": 0.1, "loss": 2.0923876762390137, "step": 6728 }, { "epoch": 0.21365079365079365, "grad_norm": 0.07666015625, "learning_rate": 0.1, "loss": 2.1071977615356445, "step": 6730 }, { "epoch": 0.21371428571428572, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.1247448921203613, "step": 6732 }, { "epoch": 0.2137777777777778, "grad_norm": 0.294921875, "learning_rate": 0.1, "loss": 2.0763418674468994, "step": 6734 }, { "epoch": 0.21384126984126983, "grad_norm": 0.042724609375, "learning_rate": 0.1, "loss": 2.1187126636505127, "step": 6736 }, { "epoch": 0.2139047619047619, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.1044921875, "step": 6738 }, { "epoch": 0.21396825396825397, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.103158473968506, "step": 6740 }, { "epoch": 0.21403174603174605, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.10312819480896, "step": 6742 }, { "epoch": 0.2140952380952381, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.1251163482666016, "step": 6744 }, { "epoch": 0.21415873015873016, "grad_norm": 0.4921875, "learning_rate": 0.1, "loss": 2.1346023082733154, "step": 6746 }, { "epoch": 0.21422222222222223, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.110842227935791, "step": 6748 }, { "epoch": 0.21428571428571427, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.1459476947784424, "step": 6750 }, { "epoch": 0.21434920634920634, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.138960599899292, "step": 6752 }, { "epoch": 0.21441269841269842, "grad_norm": 0.05078125, "learning_rate": 0.1, "loss": 2.1437880992889404, "step": 6754 }, { "epoch": 0.2144761904761905, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.138582468032837, "step": 6756 }, { "epoch": 0.21453968253968253, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.1238245964050293, "step": 6758 }, { "epoch": 0.2146031746031746, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.1641857624053955, "step": 6760 }, { "epoch": 0.21466666666666667, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.1528782844543457, "step": 6762 }, { "epoch": 0.21473015873015874, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.115715742111206, "step": 6764 }, { "epoch": 0.2147936507936508, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.174689292907715, "step": 6766 }, { "epoch": 0.21485714285714286, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.153226375579834, "step": 6768 }, { "epoch": 0.21492063492063493, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.175689220428467, "step": 6770 }, { "epoch": 0.21498412698412697, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.1236414909362793, "step": 6772 }, { "epoch": 0.21504761904761904, "grad_norm": 0.23046875, "learning_rate": 0.1, "loss": 2.137789249420166, "step": 6774 }, { "epoch": 0.21511111111111111, "grad_norm": 0.32421875, "learning_rate": 0.1, "loss": 2.1604502201080322, "step": 6776 }, { "epoch": 0.21517460317460319, "grad_norm": 0.248046875, "learning_rate": 0.1, "loss": 2.1521918773651123, "step": 6778 }, { "epoch": 0.21523809523809523, "grad_norm": 0.06640625, "learning_rate": 0.1, "loss": 2.1435136795043945, "step": 6780 }, { "epoch": 0.2153015873015873, "grad_norm": 0.11962890625, "learning_rate": 0.1, "loss": 2.146631956100464, "step": 6782 }, { "epoch": 0.21536507936507937, "grad_norm": 0.302734375, "learning_rate": 0.1, "loss": 2.1651358604431152, "step": 6784 }, { "epoch": 0.21542857142857144, "grad_norm": 0.12353515625, "learning_rate": 0.1, "loss": 2.126324415206909, "step": 6786 }, { "epoch": 0.21549206349206348, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.1563475131988525, "step": 6788 }, { "epoch": 0.21555555555555556, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.133108615875244, "step": 6790 }, { "epoch": 0.21561904761904763, "grad_norm": 0.2734375, "learning_rate": 0.1, "loss": 2.1371994018554688, "step": 6792 }, { "epoch": 0.21568253968253967, "grad_norm": 0.0927734375, "learning_rate": 0.1, "loss": 2.16325306892395, "step": 6794 }, { "epoch": 0.21574603174603174, "grad_norm": 0.0576171875, "learning_rate": 0.1, "loss": 2.1511008739471436, "step": 6796 }, { "epoch": 0.2158095238095238, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.152240753173828, "step": 6798 }, { "epoch": 0.21587301587301588, "grad_norm": 0.0693359375, "learning_rate": 0.1, "loss": 2.1599864959716797, "step": 6800 }, { "epoch": 0.21593650793650793, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.1392788887023926, "step": 6802 }, { "epoch": 0.216, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.1764185428619385, "step": 6804 }, { "epoch": 0.21606349206349207, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.137763500213623, "step": 6806 }, { "epoch": 0.21612698412698414, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.1622676849365234, "step": 6808 }, { "epoch": 0.21619047619047618, "grad_norm": 0.09375, "learning_rate": 0.1, "loss": 2.1416285037994385, "step": 6810 }, { "epoch": 0.21625396825396825, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.1306066513061523, "step": 6812 }, { "epoch": 0.21631746031746032, "grad_norm": 0.10791015625, "learning_rate": 0.1, "loss": 2.144252300262451, "step": 6814 }, { "epoch": 0.21638095238095237, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.154329538345337, "step": 6816 }, { "epoch": 0.21644444444444444, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.135835886001587, "step": 6818 }, { "epoch": 0.2165079365079365, "grad_norm": 0.10693359375, "learning_rate": 0.1, "loss": 2.1357626914978027, "step": 6820 }, { "epoch": 0.21657142857142858, "grad_norm": 0.318359375, "learning_rate": 0.1, "loss": 2.1362857818603516, "step": 6822 }, { "epoch": 0.21663492063492062, "grad_norm": 0.318359375, "learning_rate": 0.1, "loss": 2.144066572189331, "step": 6824 }, { "epoch": 0.2166984126984127, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.136061191558838, "step": 6826 }, { "epoch": 0.21676190476190477, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.155639410018921, "step": 6828 }, { "epoch": 0.21682539682539684, "grad_norm": 0.06640625, "learning_rate": 0.1, "loss": 2.1442878246307373, "step": 6830 }, { "epoch": 0.21688888888888888, "grad_norm": 0.11279296875, "learning_rate": 0.1, "loss": 2.1070637702941895, "step": 6832 }, { "epoch": 0.21695238095238095, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.136019706726074, "step": 6834 }, { "epoch": 0.21701587301587302, "grad_norm": 0.37890625, "learning_rate": 0.1, "loss": 2.1161301136016846, "step": 6836 }, { "epoch": 0.21707936507936507, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.1276957988739014, "step": 6838 }, { "epoch": 0.21714285714285714, "grad_norm": 0.1240234375, "learning_rate": 0.1, "loss": 2.1618661880493164, "step": 6840 }, { "epoch": 0.2172063492063492, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.1654696464538574, "step": 6842 }, { "epoch": 0.21726984126984128, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.140983819961548, "step": 6844 }, { "epoch": 0.21733333333333332, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.1220474243164062, "step": 6846 }, { "epoch": 0.2173968253968254, "grad_norm": 0.10595703125, "learning_rate": 0.1, "loss": 2.125519275665283, "step": 6848 }, { "epoch": 0.21746031746031746, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.1399195194244385, "step": 6850 }, { "epoch": 0.21752380952380954, "grad_norm": 0.2490234375, "learning_rate": 0.1, "loss": 2.1371569633483887, "step": 6852 }, { "epoch": 0.21758730158730158, "grad_norm": 0.11767578125, "learning_rate": 0.1, "loss": 2.1230525970458984, "step": 6854 }, { "epoch": 0.21765079365079365, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.14467191696167, "step": 6856 }, { "epoch": 0.21771428571428572, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.1237266063690186, "step": 6858 }, { "epoch": 0.21777777777777776, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.1377322673797607, "step": 6860 }, { "epoch": 0.21784126984126984, "grad_norm": 0.455078125, "learning_rate": 0.1, "loss": 2.13993239402771, "step": 6862 }, { "epoch": 0.2179047619047619, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.126100778579712, "step": 6864 }, { "epoch": 0.21796825396825398, "grad_norm": 0.0654296875, "learning_rate": 0.1, "loss": 2.139137029647827, "step": 6866 }, { "epoch": 0.21803174603174602, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.0957655906677246, "step": 6868 }, { "epoch": 0.2180952380952381, "grad_norm": 0.0615234375, "learning_rate": 0.1, "loss": 2.130746603012085, "step": 6870 }, { "epoch": 0.21815873015873016, "grad_norm": 0.078125, "learning_rate": 0.1, "loss": 2.1581666469573975, "step": 6872 }, { "epoch": 0.21822222222222223, "grad_norm": 0.044921875, "learning_rate": 0.1, "loss": 2.1414363384246826, "step": 6874 }, { "epoch": 0.21828571428571428, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.1115469932556152, "step": 6876 }, { "epoch": 0.21834920634920635, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.134005546569824, "step": 6878 }, { "epoch": 0.21841269841269842, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.114476203918457, "step": 6880 }, { "epoch": 0.21847619047619046, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.124354124069214, "step": 6882 }, { "epoch": 0.21853968253968253, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.1224350929260254, "step": 6884 }, { "epoch": 0.2186031746031746, "grad_norm": 0.2177734375, "learning_rate": 0.1, "loss": 2.15765380859375, "step": 6886 }, { "epoch": 0.21866666666666668, "grad_norm": 0.365234375, "learning_rate": 0.1, "loss": 2.1117336750030518, "step": 6888 }, { "epoch": 0.21873015873015872, "grad_norm": 0.0869140625, "learning_rate": 0.1, "loss": 2.1285619735717773, "step": 6890 }, { "epoch": 0.2187936507936508, "grad_norm": 0.072265625, "learning_rate": 0.1, "loss": 2.1604788303375244, "step": 6892 }, { "epoch": 0.21885714285714286, "grad_norm": 0.046875, "learning_rate": 0.1, "loss": 2.129450559616089, "step": 6894 }, { "epoch": 0.21892063492063493, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.1165854930877686, "step": 6896 }, { "epoch": 0.21898412698412698, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.123339891433716, "step": 6898 }, { "epoch": 0.21904761904761905, "grad_norm": 0.0947265625, "learning_rate": 0.1, "loss": 2.1395325660705566, "step": 6900 }, { "epoch": 0.21911111111111112, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.132673978805542, "step": 6902 }, { "epoch": 0.21917460317460316, "grad_norm": 0.396484375, "learning_rate": 0.1, "loss": 2.138701915740967, "step": 6904 }, { "epoch": 0.21923809523809523, "grad_norm": 0.349609375, "learning_rate": 0.1, "loss": 2.1500210762023926, "step": 6906 }, { "epoch": 0.2193015873015873, "grad_norm": 0.09326171875, "learning_rate": 0.1, "loss": 2.1671550273895264, "step": 6908 }, { "epoch": 0.21936507936507937, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.166747808456421, "step": 6910 }, { "epoch": 0.21942857142857142, "grad_norm": 0.06982421875, "learning_rate": 0.1, "loss": 2.184004783630371, "step": 6912 }, { "epoch": 0.2194920634920635, "grad_norm": 0.06640625, "learning_rate": 0.1, "loss": 2.153785228729248, "step": 6914 }, { "epoch": 0.21955555555555556, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.139852523803711, "step": 6916 }, { "epoch": 0.21961904761904763, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.1561646461486816, "step": 6918 }, { "epoch": 0.21968253968253967, "grad_norm": 0.26953125, "learning_rate": 0.1, "loss": 2.1049249172210693, "step": 6920 }, { "epoch": 0.21974603174603174, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.150930404663086, "step": 6922 }, { "epoch": 0.21980952380952382, "grad_norm": 0.1181640625, "learning_rate": 0.1, "loss": 2.1360626220703125, "step": 6924 }, { "epoch": 0.2198730158730159, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.13289475440979, "step": 6926 }, { "epoch": 0.21993650793650793, "grad_norm": 0.1171875, "learning_rate": 0.1, "loss": 2.1386618614196777, "step": 6928 }, { "epoch": 0.22, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.132570505142212, "step": 6930 }, { "epoch": 0.22006349206349207, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.153592109680176, "step": 6932 }, { "epoch": 0.22012698412698412, "grad_norm": 0.1240234375, "learning_rate": 0.1, "loss": 2.1296417713165283, "step": 6934 }, { "epoch": 0.2201904761904762, "grad_norm": 0.0498046875, "learning_rate": 0.1, "loss": 2.174931764602661, "step": 6936 }, { "epoch": 0.22025396825396826, "grad_norm": 0.068359375, "learning_rate": 0.1, "loss": 2.1546332836151123, "step": 6938 }, { "epoch": 0.22031746031746033, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.152759313583374, "step": 6940 }, { "epoch": 0.22038095238095237, "grad_norm": 0.2294921875, "learning_rate": 0.1, "loss": 2.1379473209381104, "step": 6942 }, { "epoch": 0.22044444444444444, "grad_norm": 0.306640625, "learning_rate": 0.1, "loss": 2.1671106815338135, "step": 6944 }, { "epoch": 0.22050793650793651, "grad_norm": 0.310546875, "learning_rate": 0.1, "loss": 2.1411094665527344, "step": 6946 }, { "epoch": 0.22057142857142858, "grad_norm": 0.06982421875, "learning_rate": 0.1, "loss": 2.147550344467163, "step": 6948 }, { "epoch": 0.22063492063492063, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.1512222290039062, "step": 6950 }, { "epoch": 0.2206984126984127, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.1380372047424316, "step": 6952 }, { "epoch": 0.22076190476190477, "grad_norm": 0.07763671875, "learning_rate": 0.1, "loss": 2.128605365753174, "step": 6954 }, { "epoch": 0.2208253968253968, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.131682872772217, "step": 6956 }, { "epoch": 0.22088888888888888, "grad_norm": 0.396484375, "learning_rate": 0.1, "loss": 2.1413862705230713, "step": 6958 }, { "epoch": 0.22095238095238096, "grad_norm": 0.11083984375, "learning_rate": 0.1, "loss": 2.119349956512451, "step": 6960 }, { "epoch": 0.22101587301587303, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.132967472076416, "step": 6962 }, { "epoch": 0.22107936507936507, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.1547927856445312, "step": 6964 }, { "epoch": 0.22114285714285714, "grad_norm": 0.10595703125, "learning_rate": 0.1, "loss": 2.1142804622650146, "step": 6966 }, { "epoch": 0.2212063492063492, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.1691954135894775, "step": 6968 }, { "epoch": 0.22126984126984128, "grad_norm": 0.060546875, "learning_rate": 0.1, "loss": 2.152580499649048, "step": 6970 }, { "epoch": 0.22133333333333333, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.1411333084106445, "step": 6972 }, { "epoch": 0.2213968253968254, "grad_norm": 0.1015625, "learning_rate": 0.1, "loss": 2.143845558166504, "step": 6974 }, { "epoch": 0.22146031746031747, "grad_norm": 0.04736328125, "learning_rate": 0.1, "loss": 2.1259186267852783, "step": 6976 }, { "epoch": 0.2215238095238095, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.132357358932495, "step": 6978 }, { "epoch": 0.22158730158730158, "grad_norm": 0.31640625, "learning_rate": 0.1, "loss": 2.1286840438842773, "step": 6980 }, { "epoch": 0.22165079365079365, "grad_norm": 0.09375, "learning_rate": 0.1, "loss": 2.1343026161193848, "step": 6982 }, { "epoch": 0.22171428571428572, "grad_norm": 0.228515625, "learning_rate": 0.1, "loss": 2.130988359451294, "step": 6984 }, { "epoch": 0.22177777777777777, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.1570754051208496, "step": 6986 }, { "epoch": 0.22184126984126984, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.1592862606048584, "step": 6988 }, { "epoch": 0.2219047619047619, "grad_norm": 0.2490234375, "learning_rate": 0.1, "loss": 2.153437852859497, "step": 6990 }, { "epoch": 0.22196825396825398, "grad_norm": 0.236328125, "learning_rate": 0.1, "loss": 2.1286792755126953, "step": 6992 }, { "epoch": 0.22203174603174602, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.162008762359619, "step": 6994 }, { "epoch": 0.2220952380952381, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.13472056388855, "step": 6996 }, { "epoch": 0.22215873015873017, "grad_norm": 0.11669921875, "learning_rate": 0.1, "loss": 2.157270908355713, "step": 6998 }, { "epoch": 0.2222222222222222, "grad_norm": 0.0751953125, "learning_rate": 0.1, "loss": 2.1538546085357666, "step": 7000 }, { "epoch": 0.22228571428571428, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.159203052520752, "step": 7002 }, { "epoch": 0.22234920634920635, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.1209523677825928, "step": 7004 }, { "epoch": 0.22241269841269842, "grad_norm": 0.06201171875, "learning_rate": 0.1, "loss": 2.1160812377929688, "step": 7006 }, { "epoch": 0.22247619047619047, "grad_norm": 0.2333984375, "learning_rate": 0.1, "loss": 2.1593992710113525, "step": 7008 }, { "epoch": 0.22253968253968254, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.163609504699707, "step": 7010 }, { "epoch": 0.2226031746031746, "grad_norm": 0.07373046875, "learning_rate": 0.1, "loss": 2.1450819969177246, "step": 7012 }, { "epoch": 0.22266666666666668, "grad_norm": 0.0751953125, "learning_rate": 0.1, "loss": 2.139927625656128, "step": 7014 }, { "epoch": 0.22273015873015872, "grad_norm": 0.1181640625, "learning_rate": 0.1, "loss": 2.1318132877349854, "step": 7016 }, { "epoch": 0.2227936507936508, "grad_norm": 0.244140625, "learning_rate": 0.1, "loss": 2.153622627258301, "step": 7018 }, { "epoch": 0.22285714285714286, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.1857666969299316, "step": 7020 }, { "epoch": 0.2229206349206349, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.1610965728759766, "step": 7022 }, { "epoch": 0.22298412698412698, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.1289515495300293, "step": 7024 }, { "epoch": 0.22304761904761905, "grad_norm": 0.294921875, "learning_rate": 0.1, "loss": 2.1287918090820312, "step": 7026 }, { "epoch": 0.22311111111111112, "grad_norm": 0.30078125, "learning_rate": 0.1, "loss": 2.1367766857147217, "step": 7028 }, { "epoch": 0.22317460317460316, "grad_norm": 0.056396484375, "learning_rate": 0.1, "loss": 2.1566615104675293, "step": 7030 }, { "epoch": 0.22323809523809524, "grad_norm": 0.10791015625, "learning_rate": 0.1, "loss": 2.1559834480285645, "step": 7032 }, { "epoch": 0.2233015873015873, "grad_norm": 0.20703125, "learning_rate": 0.1, "loss": 2.1447644233703613, "step": 7034 }, { "epoch": 0.22336507936507938, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.114109754562378, "step": 7036 }, { "epoch": 0.22342857142857142, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.1257081031799316, "step": 7038 }, { "epoch": 0.2234920634920635, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.1463351249694824, "step": 7040 }, { "epoch": 0.22355555555555556, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.1157119274139404, "step": 7042 }, { "epoch": 0.2236190476190476, "grad_norm": 0.06396484375, "learning_rate": 0.1, "loss": 2.1585922241210938, "step": 7044 }, { "epoch": 0.22368253968253968, "grad_norm": 0.054931640625, "learning_rate": 0.1, "loss": 2.171070098876953, "step": 7046 }, { "epoch": 0.22374603174603175, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.130720853805542, "step": 7048 }, { "epoch": 0.22380952380952382, "grad_norm": 0.333984375, "learning_rate": 0.1, "loss": 2.149533271789551, "step": 7050 }, { "epoch": 0.22387301587301586, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.157966375350952, "step": 7052 }, { "epoch": 0.22393650793650793, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.164947509765625, "step": 7054 }, { "epoch": 0.224, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.141514778137207, "step": 7056 }, { "epoch": 0.22406349206349208, "grad_norm": 0.2451171875, "learning_rate": 0.1, "loss": 2.101151704788208, "step": 7058 }, { "epoch": 0.22412698412698412, "grad_norm": 0.330078125, "learning_rate": 0.1, "loss": 2.178180694580078, "step": 7060 }, { "epoch": 0.2241904761904762, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.1435985565185547, "step": 7062 }, { "epoch": 0.22425396825396826, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.1563589572906494, "step": 7064 }, { "epoch": 0.2243174603174603, "grad_norm": 0.2421875, "learning_rate": 0.1, "loss": 2.1421165466308594, "step": 7066 }, { "epoch": 0.22438095238095238, "grad_norm": 0.244140625, "learning_rate": 0.1, "loss": 2.141111373901367, "step": 7068 }, { "epoch": 0.22444444444444445, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.121093511581421, "step": 7070 }, { "epoch": 0.22450793650793652, "grad_norm": 0.1103515625, "learning_rate": 0.1, "loss": 2.152310609817505, "step": 7072 }, { "epoch": 0.22457142857142856, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.136263847351074, "step": 7074 }, { "epoch": 0.22463492063492063, "grad_norm": 0.0810546875, "learning_rate": 0.1, "loss": 2.1504621505737305, "step": 7076 }, { "epoch": 0.2246984126984127, "grad_norm": 0.0751953125, "learning_rate": 0.1, "loss": 2.126725196838379, "step": 7078 }, { "epoch": 0.22476190476190477, "grad_norm": 0.10546875, "learning_rate": 0.1, "loss": 2.128787040710449, "step": 7080 }, { "epoch": 0.22482539682539682, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.1723103523254395, "step": 7082 }, { "epoch": 0.2248888888888889, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.1483514308929443, "step": 7084 }, { "epoch": 0.22495238095238096, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.128434419631958, "step": 7086 }, { "epoch": 0.225015873015873, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.1444332599639893, "step": 7088 }, { "epoch": 0.22507936507936507, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.1513185501098633, "step": 7090 }, { "epoch": 0.22514285714285714, "grad_norm": 0.1982421875, "learning_rate": 0.1, "loss": 2.1210269927978516, "step": 7092 }, { "epoch": 0.22520634920634922, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.1175100803375244, "step": 7094 }, { "epoch": 0.22526984126984126, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.1589152812957764, "step": 7096 }, { "epoch": 0.22533333333333333, "grad_norm": 0.051025390625, "learning_rate": 0.1, "loss": 2.124692440032959, "step": 7098 }, { "epoch": 0.2253968253968254, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.130506753921509, "step": 7100 }, { "epoch": 0.22546031746031747, "grad_norm": 0.34375, "learning_rate": 0.1, "loss": 2.122781276702881, "step": 7102 }, { "epoch": 0.22552380952380952, "grad_norm": 0.1982421875, "learning_rate": 0.1, "loss": 2.1266984939575195, "step": 7104 }, { "epoch": 0.2255873015873016, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.1258769035339355, "step": 7106 }, { "epoch": 0.22565079365079366, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.1037776470184326, "step": 7108 }, { "epoch": 0.2257142857142857, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.118499755859375, "step": 7110 }, { "epoch": 0.22577777777777777, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.139988660812378, "step": 7112 }, { "epoch": 0.22584126984126984, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.125767707824707, "step": 7114 }, { "epoch": 0.2259047619047619, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.1412160396575928, "step": 7116 }, { "epoch": 0.22596825396825396, "grad_norm": 0.05810546875, "learning_rate": 0.1, "loss": 2.13714599609375, "step": 7118 }, { "epoch": 0.22603174603174603, "grad_norm": 0.10009765625, "learning_rate": 0.1, "loss": 2.138188123703003, "step": 7120 }, { "epoch": 0.2260952380952381, "grad_norm": 0.10107421875, "learning_rate": 0.1, "loss": 2.1515491008758545, "step": 7122 }, { "epoch": 0.22615873015873017, "grad_norm": 0.1142578125, "learning_rate": 0.1, "loss": 2.115903615951538, "step": 7124 }, { "epoch": 0.2262222222222222, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.1299021244049072, "step": 7126 }, { "epoch": 0.22628571428571428, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.1392688751220703, "step": 7128 }, { "epoch": 0.22634920634920636, "grad_norm": 0.35546875, "learning_rate": 0.1, "loss": 2.1206765174865723, "step": 7130 }, { "epoch": 0.22641269841269843, "grad_norm": 0.32421875, "learning_rate": 0.1, "loss": 2.111806631088257, "step": 7132 }, { "epoch": 0.22647619047619047, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.139885663986206, "step": 7134 }, { "epoch": 0.22653968253968254, "grad_norm": 0.068359375, "learning_rate": 0.1, "loss": 2.1079506874084473, "step": 7136 }, { "epoch": 0.2266031746031746, "grad_norm": 0.10107421875, "learning_rate": 0.1, "loss": 2.0922160148620605, "step": 7138 }, { "epoch": 0.22666666666666666, "grad_norm": 0.0751953125, "learning_rate": 0.1, "loss": 2.1544196605682373, "step": 7140 }, { "epoch": 0.22673015873015873, "grad_norm": 0.224609375, "learning_rate": 0.1, "loss": 2.1479554176330566, "step": 7142 }, { "epoch": 0.2267936507936508, "grad_norm": 0.23046875, "learning_rate": 0.1, "loss": 2.1177079677581787, "step": 7144 }, { "epoch": 0.22685714285714287, "grad_norm": 0.07861328125, "learning_rate": 0.1, "loss": 2.102313756942749, "step": 7146 }, { "epoch": 0.2269206349206349, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.081862449645996, "step": 7148 }, { "epoch": 0.22698412698412698, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.1289217472076416, "step": 7150 }, { "epoch": 0.22704761904761905, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.0885682106018066, "step": 7152 }, { "epoch": 0.22711111111111112, "grad_norm": 0.232421875, "learning_rate": 0.1, "loss": 2.1082844734191895, "step": 7154 }, { "epoch": 0.22717460317460317, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.1205620765686035, "step": 7156 }, { "epoch": 0.22723809523809524, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.128849744796753, "step": 7158 }, { "epoch": 0.2273015873015873, "grad_norm": 0.07470703125, "learning_rate": 0.1, "loss": 2.084279775619507, "step": 7160 }, { "epoch": 0.22736507936507935, "grad_norm": 0.2392578125, "learning_rate": 0.1, "loss": 2.097198486328125, "step": 7162 }, { "epoch": 0.22742857142857142, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.088974952697754, "step": 7164 }, { "epoch": 0.2274920634920635, "grad_norm": 0.07373046875, "learning_rate": 0.1, "loss": 2.09980845451355, "step": 7166 }, { "epoch": 0.22755555555555557, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.080089569091797, "step": 7168 }, { "epoch": 0.2276190476190476, "grad_norm": 0.2353515625, "learning_rate": 0.1, "loss": 2.1099812984466553, "step": 7170 }, { "epoch": 0.22768253968253968, "grad_norm": 0.09619140625, "learning_rate": 0.1, "loss": 2.1236658096313477, "step": 7172 }, { "epoch": 0.22774603174603175, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.122100830078125, "step": 7174 }, { "epoch": 0.22780952380952382, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.123049736022949, "step": 7176 }, { "epoch": 0.22787301587301587, "grad_norm": 0.11328125, "learning_rate": 0.1, "loss": 2.094006061553955, "step": 7178 }, { "epoch": 0.22793650793650794, "grad_norm": 0.05419921875, "learning_rate": 0.1, "loss": 2.116697072982788, "step": 7180 }, { "epoch": 0.228, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.137547731399536, "step": 7182 }, { "epoch": 0.22806349206349205, "grad_norm": 0.10400390625, "learning_rate": 0.1, "loss": 2.1234238147735596, "step": 7184 }, { "epoch": 0.22812698412698412, "grad_norm": 0.111328125, "learning_rate": 0.1, "loss": 2.1170268058776855, "step": 7186 }, { "epoch": 0.2281904761904762, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.12412428855896, "step": 7188 }, { "epoch": 0.22825396825396826, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.1083738803863525, "step": 7190 }, { "epoch": 0.2283174603174603, "grad_norm": 0.06591796875, "learning_rate": 0.1, "loss": 2.1555607318878174, "step": 7192 }, { "epoch": 0.22838095238095238, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.0828068256378174, "step": 7194 }, { "epoch": 0.22844444444444445, "grad_norm": 0.05615234375, "learning_rate": 0.1, "loss": 2.1087217330932617, "step": 7196 }, { "epoch": 0.22850793650793652, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.0941569805145264, "step": 7198 }, { "epoch": 0.22857142857142856, "grad_norm": 0.41796875, "learning_rate": 0.1, "loss": 2.099388837814331, "step": 7200 }, { "epoch": 0.22863492063492064, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.1068103313446045, "step": 7202 }, { "epoch": 0.2286984126984127, "grad_norm": 0.314453125, "learning_rate": 0.1, "loss": 2.098156213760376, "step": 7204 }, { "epoch": 0.22876190476190475, "grad_norm": 0.0703125, "learning_rate": 0.1, "loss": 2.109511613845825, "step": 7206 }, { "epoch": 0.22882539682539682, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.0991880893707275, "step": 7208 }, { "epoch": 0.2288888888888889, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.1275899410247803, "step": 7210 }, { "epoch": 0.22895238095238096, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.1142795085906982, "step": 7212 }, { "epoch": 0.229015873015873, "grad_norm": 0.053466796875, "learning_rate": 0.1, "loss": 2.108628511428833, "step": 7214 }, { "epoch": 0.22907936507936508, "grad_norm": 0.05859375, "learning_rate": 0.1, "loss": 2.0655782222747803, "step": 7216 }, { "epoch": 0.22914285714285715, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.1037392616271973, "step": 7218 }, { "epoch": 0.22920634920634922, "grad_norm": 0.26171875, "learning_rate": 0.1, "loss": 2.0752387046813965, "step": 7220 }, { "epoch": 0.22926984126984126, "grad_norm": 0.062255859375, "learning_rate": 0.1, "loss": 2.125493288040161, "step": 7222 }, { "epoch": 0.22933333333333333, "grad_norm": 0.0693359375, "learning_rate": 0.1, "loss": 2.0617048740386963, "step": 7224 }, { "epoch": 0.2293968253968254, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.097587823867798, "step": 7226 }, { "epoch": 0.22946031746031745, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.08406925201416, "step": 7228 }, { "epoch": 0.22952380952380952, "grad_norm": 0.10546875, "learning_rate": 0.1, "loss": 2.0743837356567383, "step": 7230 }, { "epoch": 0.2295873015873016, "grad_norm": 0.203125, "learning_rate": 0.1, "loss": 2.102339506149292, "step": 7232 }, { "epoch": 0.22965079365079366, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.0809457302093506, "step": 7234 }, { "epoch": 0.2297142857142857, "grad_norm": 0.2294921875, "learning_rate": 0.1, "loss": 2.059857130050659, "step": 7236 }, { "epoch": 0.22977777777777778, "grad_norm": 0.32421875, "learning_rate": 0.1, "loss": 2.0958189964294434, "step": 7238 }, { "epoch": 0.22984126984126985, "grad_norm": 0.07421875, "learning_rate": 0.1, "loss": 2.073896646499634, "step": 7240 }, { "epoch": 0.22990476190476192, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.1097676753997803, "step": 7242 }, { "epoch": 0.22996825396825396, "grad_norm": 0.2734375, "learning_rate": 0.1, "loss": 2.0588741302490234, "step": 7244 }, { "epoch": 0.23003174603174603, "grad_norm": 0.10107421875, "learning_rate": 0.1, "loss": 2.0870587825775146, "step": 7246 }, { "epoch": 0.2300952380952381, "grad_norm": 0.10986328125, "learning_rate": 0.1, "loss": 2.0808043479919434, "step": 7248 }, { "epoch": 0.23015873015873015, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.0871644020080566, "step": 7250 }, { "epoch": 0.23022222222222222, "grad_norm": 0.28125, "learning_rate": 0.1, "loss": 2.0993053913116455, "step": 7252 }, { "epoch": 0.2302857142857143, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.1009299755096436, "step": 7254 }, { "epoch": 0.23034920634920636, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.1019914150238037, "step": 7256 }, { "epoch": 0.2304126984126984, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.0885250568389893, "step": 7258 }, { "epoch": 0.23047619047619047, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.1021342277526855, "step": 7260 }, { "epoch": 0.23053968253968254, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.0971949100494385, "step": 7262 }, { "epoch": 0.23060317460317462, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.100215196609497, "step": 7264 }, { "epoch": 0.23066666666666666, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.1143314838409424, "step": 7266 }, { "epoch": 0.23073015873015873, "grad_norm": 0.2734375, "learning_rate": 0.1, "loss": 2.0838236808776855, "step": 7268 }, { "epoch": 0.2307936507936508, "grad_norm": 0.20703125, "learning_rate": 0.1, "loss": 2.0889368057250977, "step": 7270 }, { "epoch": 0.23085714285714284, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.073298454284668, "step": 7272 }, { "epoch": 0.23092063492063492, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.0853874683380127, "step": 7274 }, { "epoch": 0.230984126984127, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.07358717918396, "step": 7276 }, { "epoch": 0.23104761904761906, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.0914080142974854, "step": 7278 }, { "epoch": 0.2311111111111111, "grad_norm": 0.0673828125, "learning_rate": 0.1, "loss": 2.104854106903076, "step": 7280 }, { "epoch": 0.23117460317460317, "grad_norm": 0.051513671875, "learning_rate": 0.1, "loss": 2.080443859100342, "step": 7282 }, { "epoch": 0.23123809523809524, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.071145534515381, "step": 7284 }, { "epoch": 0.2313015873015873, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.039267063140869, "step": 7286 }, { "epoch": 0.23136507936507936, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.0707762241363525, "step": 7288 }, { "epoch": 0.23142857142857143, "grad_norm": 0.10888671875, "learning_rate": 0.1, "loss": 2.0882930755615234, "step": 7290 }, { "epoch": 0.2314920634920635, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.077226400375366, "step": 7292 }, { "epoch": 0.23155555555555554, "grad_norm": 0.08349609375, "learning_rate": 0.1, "loss": 2.0640923976898193, "step": 7294 }, { "epoch": 0.2316190476190476, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.0957696437835693, "step": 7296 }, { "epoch": 0.23168253968253968, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.090580463409424, "step": 7298 }, { "epoch": 0.23174603174603176, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.0697145462036133, "step": 7300 }, { "epoch": 0.2318095238095238, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.029175043106079, "step": 7302 }, { "epoch": 0.23187301587301587, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.071197271347046, "step": 7304 }, { "epoch": 0.23193650793650794, "grad_norm": 0.224609375, "learning_rate": 0.1, "loss": 2.053938388824463, "step": 7306 }, { "epoch": 0.232, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.0855093002319336, "step": 7308 }, { "epoch": 0.23206349206349206, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.0687525272369385, "step": 7310 }, { "epoch": 0.23212698412698413, "grad_norm": 0.07177734375, "learning_rate": 0.1, "loss": 2.0972952842712402, "step": 7312 }, { "epoch": 0.2321904761904762, "grad_norm": 0.05322265625, "learning_rate": 0.1, "loss": 2.075571298599243, "step": 7314 }, { "epoch": 0.23225396825396824, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.0745177268981934, "step": 7316 }, { "epoch": 0.2323174603174603, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.0768115520477295, "step": 7318 }, { "epoch": 0.23238095238095238, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.0695269107818604, "step": 7320 }, { "epoch": 0.23244444444444445, "grad_norm": 0.388671875, "learning_rate": 0.1, "loss": 2.0749969482421875, "step": 7322 }, { "epoch": 0.2325079365079365, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.0699052810668945, "step": 7324 }, { "epoch": 0.23257142857142857, "grad_norm": 0.055908203125, "learning_rate": 0.1, "loss": 2.037580966949463, "step": 7326 }, { "epoch": 0.23263492063492064, "grad_norm": 0.0888671875, "learning_rate": 0.1, "loss": 2.060831308364868, "step": 7328 }, { "epoch": 0.2326984126984127, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.086434841156006, "step": 7330 }, { "epoch": 0.23276190476190475, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.0842316150665283, "step": 7332 }, { "epoch": 0.23282539682539682, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.0805346965789795, "step": 7334 }, { "epoch": 0.2328888888888889, "grad_norm": 0.251953125, "learning_rate": 0.1, "loss": 2.046854257583618, "step": 7336 }, { "epoch": 0.23295238095238094, "grad_norm": 0.05224609375, "learning_rate": 0.1, "loss": 2.1270952224731445, "step": 7338 }, { "epoch": 0.233015873015873, "grad_norm": 0.11328125, "learning_rate": 0.1, "loss": 2.08383846282959, "step": 7340 }, { "epoch": 0.23307936507936508, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.0640437602996826, "step": 7342 }, { "epoch": 0.23314285714285715, "grad_norm": 0.07568359375, "learning_rate": 0.1, "loss": 2.0862276554107666, "step": 7344 }, { "epoch": 0.2332063492063492, "grad_norm": 0.0654296875, "learning_rate": 0.1, "loss": 2.1018731594085693, "step": 7346 }, { "epoch": 0.23326984126984127, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.0996389389038086, "step": 7348 }, { "epoch": 0.23333333333333334, "grad_norm": 0.10498046875, "learning_rate": 0.1, "loss": 2.096540689468384, "step": 7350 }, { "epoch": 0.2333968253968254, "grad_norm": 0.057373046875, "learning_rate": 0.1, "loss": 2.101809024810791, "step": 7352 }, { "epoch": 0.23346031746031745, "grad_norm": 0.388671875, "learning_rate": 0.1, "loss": 2.097398281097412, "step": 7354 }, { "epoch": 0.23352380952380952, "grad_norm": 0.359375, "learning_rate": 0.1, "loss": 2.1124465465545654, "step": 7356 }, { "epoch": 0.2335873015873016, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.0534331798553467, "step": 7358 }, { "epoch": 0.23365079365079366, "grad_norm": 0.03857421875, "learning_rate": 0.1, "loss": 2.08892822265625, "step": 7360 }, { "epoch": 0.2337142857142857, "grad_norm": 0.060791015625, "learning_rate": 0.1, "loss": 2.090662956237793, "step": 7362 }, { "epoch": 0.23377777777777778, "grad_norm": 0.06689453125, "learning_rate": 0.1, "loss": 2.1300923824310303, "step": 7364 }, { "epoch": 0.23384126984126985, "grad_norm": 0.08056640625, "learning_rate": 0.1, "loss": 2.0773069858551025, "step": 7366 }, { "epoch": 0.2339047619047619, "grad_norm": 0.07421875, "learning_rate": 0.1, "loss": 2.0950794219970703, "step": 7368 }, { "epoch": 0.23396825396825396, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.0837361812591553, "step": 7370 }, { "epoch": 0.23403174603174604, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.108111619949341, "step": 7372 }, { "epoch": 0.2340952380952381, "grad_norm": 0.435546875, "learning_rate": 0.1, "loss": 2.0785460472106934, "step": 7374 }, { "epoch": 0.23415873015873015, "grad_norm": 0.37109375, "learning_rate": 0.1, "loss": 2.1281676292419434, "step": 7376 }, { "epoch": 0.23422222222222222, "grad_norm": 0.12353515625, "learning_rate": 0.1, "loss": 2.0628724098205566, "step": 7378 }, { "epoch": 0.2342857142857143, "grad_norm": 0.11181640625, "learning_rate": 0.1, "loss": 2.097722291946411, "step": 7380 }, { "epoch": 0.23434920634920636, "grad_norm": 0.0771484375, "learning_rate": 0.1, "loss": 2.1138598918914795, "step": 7382 }, { "epoch": 0.2344126984126984, "grad_norm": 0.07666015625, "learning_rate": 0.1, "loss": 2.0872881412506104, "step": 7384 }, { "epoch": 0.23447619047619048, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.118544101715088, "step": 7386 }, { "epoch": 0.23453968253968255, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.1037282943725586, "step": 7388 }, { "epoch": 0.2346031746031746, "grad_norm": 0.228515625, "learning_rate": 0.1, "loss": 2.095290422439575, "step": 7390 }, { "epoch": 0.23466666666666666, "grad_norm": 0.06396484375, "learning_rate": 0.1, "loss": 2.0770010948181152, "step": 7392 }, { "epoch": 0.23473015873015873, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.115957736968994, "step": 7394 }, { "epoch": 0.2347936507936508, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.0869646072387695, "step": 7396 }, { "epoch": 0.23485714285714285, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.0818426609039307, "step": 7398 }, { "epoch": 0.23492063492063492, "grad_norm": 0.09326171875, "learning_rate": 0.1, "loss": 2.1453909873962402, "step": 7400 }, { "epoch": 0.234984126984127, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.1280429363250732, "step": 7402 }, { "epoch": 0.23504761904761906, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.1358985900878906, "step": 7404 }, { "epoch": 0.2351111111111111, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.12707257270813, "step": 7406 }, { "epoch": 0.23517460317460318, "grad_norm": 0.10205078125, "learning_rate": 0.1, "loss": 2.105736017227173, "step": 7408 }, { "epoch": 0.23523809523809525, "grad_norm": 0.2275390625, "learning_rate": 0.1, "loss": 2.0904200077056885, "step": 7410 }, { "epoch": 0.2353015873015873, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.0633704662323, "step": 7412 }, { "epoch": 0.23536507936507936, "grad_norm": 0.07470703125, "learning_rate": 0.1, "loss": 2.11663818359375, "step": 7414 }, { "epoch": 0.23542857142857143, "grad_norm": 0.10498046875, "learning_rate": 0.1, "loss": 2.1126291751861572, "step": 7416 }, { "epoch": 0.2354920634920635, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.1367883682250977, "step": 7418 }, { "epoch": 0.23555555555555555, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.1149706840515137, "step": 7420 }, { "epoch": 0.23561904761904762, "grad_norm": 0.2255859375, "learning_rate": 0.1, "loss": 2.1011087894439697, "step": 7422 }, { "epoch": 0.2356825396825397, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.076941728591919, "step": 7424 }, { "epoch": 0.23574603174603176, "grad_norm": 0.08642578125, "learning_rate": 0.1, "loss": 2.1073803901672363, "step": 7426 }, { "epoch": 0.2358095238095238, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.09804105758667, "step": 7428 }, { "epoch": 0.23587301587301587, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.112666606903076, "step": 7430 }, { "epoch": 0.23593650793650794, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.0902698040008545, "step": 7432 }, { "epoch": 0.236, "grad_norm": 0.0810546875, "learning_rate": 0.1, "loss": 2.1277666091918945, "step": 7434 }, { "epoch": 0.23606349206349206, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.1026201248168945, "step": 7436 }, { "epoch": 0.23612698412698413, "grad_norm": 0.08056640625, "learning_rate": 0.1, "loss": 2.107773780822754, "step": 7438 }, { "epoch": 0.2361904761904762, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.1171152591705322, "step": 7440 }, { "epoch": 0.23625396825396824, "grad_norm": 0.1123046875, "learning_rate": 0.1, "loss": 2.107445478439331, "step": 7442 }, { "epoch": 0.23631746031746032, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.107590436935425, "step": 7444 }, { "epoch": 0.23638095238095239, "grad_norm": 0.2431640625, "learning_rate": 0.1, "loss": 2.111647129058838, "step": 7446 }, { "epoch": 0.23644444444444446, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.1118881702423096, "step": 7448 }, { "epoch": 0.2365079365079365, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.088986873626709, "step": 7450 }, { "epoch": 0.23657142857142857, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.1222445964813232, "step": 7452 }, { "epoch": 0.23663492063492064, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.08857798576355, "step": 7454 }, { "epoch": 0.23669841269841269, "grad_norm": 0.2373046875, "learning_rate": 0.1, "loss": 2.09635591506958, "step": 7456 }, { "epoch": 0.23676190476190476, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.1011574268341064, "step": 7458 }, { "epoch": 0.23682539682539683, "grad_norm": 0.07177734375, "learning_rate": 0.1, "loss": 2.0853559970855713, "step": 7460 }, { "epoch": 0.2368888888888889, "grad_norm": 0.1162109375, "learning_rate": 0.1, "loss": 2.069883346557617, "step": 7462 }, { "epoch": 0.23695238095238094, "grad_norm": 0.1181640625, "learning_rate": 0.1, "loss": 2.0954017639160156, "step": 7464 }, { "epoch": 0.237015873015873, "grad_norm": 0.2470703125, "learning_rate": 0.1, "loss": 2.108976125717163, "step": 7466 }, { "epoch": 0.23707936507936508, "grad_norm": 0.357421875, "learning_rate": 0.1, "loss": 2.1600942611694336, "step": 7468 }, { "epoch": 0.23714285714285716, "grad_norm": 0.1376953125, "learning_rate": 0.1, "loss": 2.140178918838501, "step": 7470 }, { "epoch": 0.2372063492063492, "grad_norm": 0.208984375, "learning_rate": 0.1, "loss": 2.152268409729004, "step": 7472 }, { "epoch": 0.23726984126984127, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.1198067665100098, "step": 7474 }, { "epoch": 0.23733333333333334, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.0796287059783936, "step": 7476 }, { "epoch": 0.23739682539682538, "grad_norm": 0.10400390625, "learning_rate": 0.1, "loss": 2.1004858016967773, "step": 7478 }, { "epoch": 0.23746031746031745, "grad_norm": 0.05908203125, "learning_rate": 0.1, "loss": 2.0791149139404297, "step": 7480 }, { "epoch": 0.23752380952380953, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.1030054092407227, "step": 7482 }, { "epoch": 0.2375873015873016, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.1030256748199463, "step": 7484 }, { "epoch": 0.23765079365079364, "grad_norm": 0.2255859375, "learning_rate": 0.1, "loss": 2.1115081310272217, "step": 7486 }, { "epoch": 0.2377142857142857, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.107701539993286, "step": 7488 }, { "epoch": 0.23777777777777778, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.1178464889526367, "step": 7490 }, { "epoch": 0.23784126984126985, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.122328042984009, "step": 7492 }, { "epoch": 0.2379047619047619, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.142416000366211, "step": 7494 }, { "epoch": 0.23796825396825397, "grad_norm": 0.07861328125, "learning_rate": 0.1, "loss": 2.1194045543670654, "step": 7496 }, { "epoch": 0.23803174603174604, "grad_norm": 0.06640625, "learning_rate": 0.1, "loss": 2.120892286300659, "step": 7498 }, { "epoch": 0.23809523809523808, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.1126275062561035, "step": 7500 }, { "epoch": 0.23815873015873015, "grad_norm": 0.10546875, "learning_rate": 0.1, "loss": 2.0881261825561523, "step": 7502 }, { "epoch": 0.23822222222222222, "grad_norm": 0.2373046875, "learning_rate": 0.1, "loss": 2.1193575859069824, "step": 7504 }, { "epoch": 0.2382857142857143, "grad_norm": 0.439453125, "learning_rate": 0.1, "loss": 2.123939037322998, "step": 7506 }, { "epoch": 0.23834920634920634, "grad_norm": 0.26171875, "learning_rate": 0.1, "loss": 2.1399338245391846, "step": 7508 }, { "epoch": 0.2384126984126984, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.1312966346740723, "step": 7510 }, { "epoch": 0.23847619047619048, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.1331863403320312, "step": 7512 }, { "epoch": 0.23853968253968255, "grad_norm": 0.083984375, "learning_rate": 0.1, "loss": 2.144838333129883, "step": 7514 }, { "epoch": 0.2386031746031746, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.130389928817749, "step": 7516 }, { "epoch": 0.23866666666666667, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.122476100921631, "step": 7518 }, { "epoch": 0.23873015873015874, "grad_norm": 0.21484375, "learning_rate": 0.1, "loss": 2.1233088970184326, "step": 7520 }, { "epoch": 0.23879365079365078, "grad_norm": 0.08544921875, "learning_rate": 0.1, "loss": 2.134186267852783, "step": 7522 }, { "epoch": 0.23885714285714285, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.120248794555664, "step": 7524 }, { "epoch": 0.23892063492063492, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.1255345344543457, "step": 7526 }, { "epoch": 0.238984126984127, "grad_norm": 0.09814453125, "learning_rate": 0.1, "loss": 2.1197824478149414, "step": 7528 }, { "epoch": 0.23904761904761904, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.1293745040893555, "step": 7530 }, { "epoch": 0.2391111111111111, "grad_norm": 0.115234375, "learning_rate": 0.1, "loss": 2.138209819793701, "step": 7532 }, { "epoch": 0.23917460317460318, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.142091989517212, "step": 7534 }, { "epoch": 0.23923809523809525, "grad_norm": 0.2216796875, "learning_rate": 0.1, "loss": 2.1212503910064697, "step": 7536 }, { "epoch": 0.2393015873015873, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.1592438220977783, "step": 7538 }, { "epoch": 0.23936507936507936, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.1681196689605713, "step": 7540 }, { "epoch": 0.23942857142857144, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.1331288814544678, "step": 7542 }, { "epoch": 0.23949206349206348, "grad_norm": 0.07275390625, "learning_rate": 0.1, "loss": 2.148681163787842, "step": 7544 }, { "epoch": 0.23955555555555555, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.1129119396209717, "step": 7546 }, { "epoch": 0.23961904761904762, "grad_norm": 0.40234375, "learning_rate": 0.1, "loss": 2.1356449127197266, "step": 7548 }, { "epoch": 0.2396825396825397, "grad_norm": 0.341796875, "learning_rate": 0.1, "loss": 2.1507699489593506, "step": 7550 }, { "epoch": 0.23974603174603173, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.11820912361145, "step": 7552 }, { "epoch": 0.2398095238095238, "grad_norm": 0.10595703125, "learning_rate": 0.1, "loss": 2.1351945400238037, "step": 7554 }, { "epoch": 0.23987301587301588, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.1485698223114014, "step": 7556 }, { "epoch": 0.23993650793650795, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.150482416152954, "step": 7558 }, { "epoch": 0.24, "grad_norm": 0.10595703125, "learning_rate": 0.1, "loss": 2.1111397743225098, "step": 7560 }, { "epoch": 0.24006349206349206, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.108712673187256, "step": 7562 }, { "epoch": 0.24012698412698413, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.1471011638641357, "step": 7564 }, { "epoch": 0.2401904761904762, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.1378467082977295, "step": 7566 }, { "epoch": 0.24025396825396825, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.1189959049224854, "step": 7568 }, { "epoch": 0.24031746031746032, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.15275239944458, "step": 7570 }, { "epoch": 0.2403809523809524, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.096132516860962, "step": 7572 }, { "epoch": 0.24044444444444443, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.1388909816741943, "step": 7574 }, { "epoch": 0.2405079365079365, "grad_norm": 0.310546875, "learning_rate": 0.1, "loss": 2.1405465602874756, "step": 7576 }, { "epoch": 0.24057142857142857, "grad_norm": 0.0576171875, "learning_rate": 0.1, "loss": 2.168820858001709, "step": 7578 }, { "epoch": 0.24063492063492065, "grad_norm": 0.080078125, "learning_rate": 0.1, "loss": 2.110339403152466, "step": 7580 }, { "epoch": 0.2406984126984127, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.1240358352661133, "step": 7582 }, { "epoch": 0.24076190476190476, "grad_norm": 0.103515625, "learning_rate": 0.1, "loss": 2.0973892211914062, "step": 7584 }, { "epoch": 0.24082539682539683, "grad_norm": 0.08935546875, "learning_rate": 0.1, "loss": 2.1381688117980957, "step": 7586 }, { "epoch": 0.2408888888888889, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.106537103652954, "step": 7588 }, { "epoch": 0.24095238095238095, "grad_norm": 0.2392578125, "learning_rate": 0.1, "loss": 2.1185178756713867, "step": 7590 }, { "epoch": 0.24101587301587302, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.120662212371826, "step": 7592 }, { "epoch": 0.2410793650793651, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.130314588546753, "step": 7594 }, { "epoch": 0.24114285714285713, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.1311545372009277, "step": 7596 }, { "epoch": 0.2412063492063492, "grad_norm": 0.1201171875, "learning_rate": 0.1, "loss": 2.1440396308898926, "step": 7598 }, { "epoch": 0.24126984126984127, "grad_norm": 0.1953125, "learning_rate": 0.1, "loss": 2.121884822845459, "step": 7600 }, { "epoch": 0.24133333333333334, "grad_norm": 0.07080078125, "learning_rate": 0.1, "loss": 2.1237287521362305, "step": 7602 }, { "epoch": 0.2413968253968254, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.1442861557006836, "step": 7604 }, { "epoch": 0.24146031746031746, "grad_norm": 0.49609375, "learning_rate": 0.1, "loss": 2.095569372177124, "step": 7606 }, { "epoch": 0.24152380952380953, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.1049482822418213, "step": 7608 }, { "epoch": 0.2415873015873016, "grad_norm": 0.10888671875, "learning_rate": 0.1, "loss": 2.1004626750946045, "step": 7610 }, { "epoch": 0.24165079365079364, "grad_norm": 0.10546875, "learning_rate": 0.1, "loss": 2.107957363128662, "step": 7612 }, { "epoch": 0.24171428571428571, "grad_norm": 0.05810546875, "learning_rate": 0.1, "loss": 2.1209492683410645, "step": 7614 }, { "epoch": 0.24177777777777779, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.1376209259033203, "step": 7616 }, { "epoch": 0.24184126984126983, "grad_norm": 0.09521484375, "learning_rate": 0.1, "loss": 2.126371383666992, "step": 7618 }, { "epoch": 0.2419047619047619, "grad_norm": 0.057373046875, "learning_rate": 0.1, "loss": 2.1156535148620605, "step": 7620 }, { "epoch": 0.24196825396825397, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.100332260131836, "step": 7622 }, { "epoch": 0.24203174603174604, "grad_norm": 0.3359375, "learning_rate": 0.1, "loss": 2.0983855724334717, "step": 7624 }, { "epoch": 0.24209523809523809, "grad_norm": 0.2412109375, "learning_rate": 0.1, "loss": 2.110562324523926, "step": 7626 }, { "epoch": 0.24215873015873016, "grad_norm": 0.06396484375, "learning_rate": 0.1, "loss": 2.120020866394043, "step": 7628 }, { "epoch": 0.24222222222222223, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.1192984580993652, "step": 7630 }, { "epoch": 0.2422857142857143, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.1244804859161377, "step": 7632 }, { "epoch": 0.24234920634920634, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.113220691680908, "step": 7634 }, { "epoch": 0.2424126984126984, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.09896183013916, "step": 7636 }, { "epoch": 0.24247619047619048, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.1062746047973633, "step": 7638 }, { "epoch": 0.24253968253968253, "grad_norm": 0.076171875, "learning_rate": 0.1, "loss": 2.0957741737365723, "step": 7640 }, { "epoch": 0.2426031746031746, "grad_norm": 0.2197265625, "learning_rate": 0.1, "loss": 2.1229372024536133, "step": 7642 }, { "epoch": 0.24266666666666667, "grad_norm": 0.29296875, "learning_rate": 0.1, "loss": 2.109020471572876, "step": 7644 }, { "epoch": 0.24273015873015874, "grad_norm": 0.2470703125, "learning_rate": 0.1, "loss": 2.1225526332855225, "step": 7646 }, { "epoch": 0.24279365079365078, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.0903031826019287, "step": 7648 }, { "epoch": 0.24285714285714285, "grad_norm": 0.1162109375, "learning_rate": 0.1, "loss": 2.0699353218078613, "step": 7650 }, { "epoch": 0.24292063492063493, "grad_norm": 0.06982421875, "learning_rate": 0.1, "loss": 2.1053171157836914, "step": 7652 }, { "epoch": 0.242984126984127, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.0997400283813477, "step": 7654 }, { "epoch": 0.24304761904761904, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.1162967681884766, "step": 7656 }, { "epoch": 0.2431111111111111, "grad_norm": 0.06640625, "learning_rate": 0.1, "loss": 2.1261515617370605, "step": 7658 }, { "epoch": 0.24317460317460318, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.093658685684204, "step": 7660 }, { "epoch": 0.24323809523809523, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.099531888961792, "step": 7662 }, { "epoch": 0.2433015873015873, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.098545789718628, "step": 7664 }, { "epoch": 0.24336507936507937, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.0889580249786377, "step": 7666 }, { "epoch": 0.24342857142857144, "grad_norm": 0.244140625, "learning_rate": 0.1, "loss": 2.082266330718994, "step": 7668 }, { "epoch": 0.24349206349206348, "grad_norm": 0.09814453125, "learning_rate": 0.1, "loss": 2.0861690044403076, "step": 7670 }, { "epoch": 0.24355555555555555, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.113250732421875, "step": 7672 }, { "epoch": 0.24361904761904762, "grad_norm": 0.2392578125, "learning_rate": 0.1, "loss": 2.147066593170166, "step": 7674 }, { "epoch": 0.2436825396825397, "grad_norm": 0.1162109375, "learning_rate": 0.1, "loss": 2.146116256713867, "step": 7676 }, { "epoch": 0.24374603174603174, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.1041014194488525, "step": 7678 }, { "epoch": 0.2438095238095238, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.1133410930633545, "step": 7680 }, { "epoch": 0.24387301587301588, "grad_norm": 0.23046875, "learning_rate": 0.1, "loss": 2.0914034843444824, "step": 7682 }, { "epoch": 0.24393650793650792, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.0841925144195557, "step": 7684 }, { "epoch": 0.244, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.0767691135406494, "step": 7686 }, { "epoch": 0.24406349206349207, "grad_norm": 0.056640625, "learning_rate": 0.1, "loss": 2.095615863800049, "step": 7688 }, { "epoch": 0.24412698412698414, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.0853114128112793, "step": 7690 }, { "epoch": 0.24419047619047618, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.0804555416107178, "step": 7692 }, { "epoch": 0.24425396825396825, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.0776097774505615, "step": 7694 }, { "epoch": 0.24431746031746032, "grad_norm": 0.07470703125, "learning_rate": 0.1, "loss": 2.124927282333374, "step": 7696 }, { "epoch": 0.2443809523809524, "grad_norm": 0.2470703125, "learning_rate": 0.1, "loss": 2.095863103866577, "step": 7698 }, { "epoch": 0.24444444444444444, "grad_norm": 0.33984375, "learning_rate": 0.1, "loss": 2.1371068954467773, "step": 7700 }, { "epoch": 0.2445079365079365, "grad_norm": 0.056640625, "learning_rate": 0.1, "loss": 2.0929276943206787, "step": 7702 }, { "epoch": 0.24457142857142858, "grad_norm": 0.10400390625, "learning_rate": 0.1, "loss": 2.104721784591675, "step": 7704 }, { "epoch": 0.24463492063492062, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.08351731300354, "step": 7706 }, { "epoch": 0.2446984126984127, "grad_norm": 0.234375, "learning_rate": 0.1, "loss": 2.108940839767456, "step": 7708 }, { "epoch": 0.24476190476190476, "grad_norm": 0.248046875, "learning_rate": 0.1, "loss": 2.1364517211914062, "step": 7710 }, { "epoch": 0.24482539682539683, "grad_norm": 0.10546875, "learning_rate": 0.1, "loss": 2.104205846786499, "step": 7712 }, { "epoch": 0.24488888888888888, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.0820083618164062, "step": 7714 }, { "epoch": 0.24495238095238095, "grad_norm": 0.0810546875, "learning_rate": 0.1, "loss": 2.1198596954345703, "step": 7716 }, { "epoch": 0.24501587301587302, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.0827841758728027, "step": 7718 }, { "epoch": 0.2450793650793651, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.100688934326172, "step": 7720 }, { "epoch": 0.24514285714285713, "grad_norm": 0.34765625, "learning_rate": 0.1, "loss": 2.1275124549865723, "step": 7722 }, { "epoch": 0.2452063492063492, "grad_norm": 0.076171875, "learning_rate": 0.1, "loss": 2.0562610626220703, "step": 7724 }, { "epoch": 0.24526984126984128, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.088521957397461, "step": 7726 }, { "epoch": 0.24533333333333332, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.1251540184020996, "step": 7728 }, { "epoch": 0.2453968253968254, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.110805034637451, "step": 7730 }, { "epoch": 0.24546031746031746, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.1409051418304443, "step": 7732 }, { "epoch": 0.24552380952380953, "grad_norm": 0.072265625, "learning_rate": 0.1, "loss": 2.089325428009033, "step": 7734 }, { "epoch": 0.24558730158730158, "grad_norm": 0.09619140625, "learning_rate": 0.1, "loss": 2.116668224334717, "step": 7736 }, { "epoch": 0.24565079365079365, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.124164581298828, "step": 7738 }, { "epoch": 0.24571428571428572, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.1034581661224365, "step": 7740 }, { "epoch": 0.2457777777777778, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.0777595043182373, "step": 7742 }, { "epoch": 0.24584126984126983, "grad_norm": 0.0732421875, "learning_rate": 0.1, "loss": 2.087156057357788, "step": 7744 }, { "epoch": 0.2459047619047619, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.0853726863861084, "step": 7746 }, { "epoch": 0.24596825396825397, "grad_norm": 0.34375, "learning_rate": 0.1, "loss": 2.1072938442230225, "step": 7748 }, { "epoch": 0.24603174603174602, "grad_norm": 0.0869140625, "learning_rate": 0.1, "loss": 2.0686442852020264, "step": 7750 }, { "epoch": 0.2460952380952381, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.0939245223999023, "step": 7752 }, { "epoch": 0.24615873015873016, "grad_norm": 0.095703125, "learning_rate": 0.1, "loss": 2.0739314556121826, "step": 7754 }, { "epoch": 0.24622222222222223, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.128819465637207, "step": 7756 }, { "epoch": 0.24628571428571427, "grad_norm": 0.1171875, "learning_rate": 0.1, "loss": 2.080601453781128, "step": 7758 }, { "epoch": 0.24634920634920635, "grad_norm": 0.0625, "learning_rate": 0.1, "loss": 2.1003646850585938, "step": 7760 }, { "epoch": 0.24641269841269842, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.07106614112854, "step": 7762 }, { "epoch": 0.2464761904761905, "grad_norm": 0.26171875, "learning_rate": 0.1, "loss": 2.0942091941833496, "step": 7764 }, { "epoch": 0.24653968253968253, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.0892837047576904, "step": 7766 }, { "epoch": 0.2466031746031746, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.1195573806762695, "step": 7768 }, { "epoch": 0.24666666666666667, "grad_norm": 0.28125, "learning_rate": 0.1, "loss": 2.1099534034729004, "step": 7770 }, { "epoch": 0.24673015873015874, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.081984281539917, "step": 7772 }, { "epoch": 0.2467936507936508, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.0995848178863525, "step": 7774 }, { "epoch": 0.24685714285714286, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.1072371006011963, "step": 7776 }, { "epoch": 0.24692063492063493, "grad_norm": 0.08642578125, "learning_rate": 0.1, "loss": 2.0923726558685303, "step": 7778 }, { "epoch": 0.24698412698412697, "grad_norm": 0.052978515625, "learning_rate": 0.1, "loss": 2.088723659515381, "step": 7780 }, { "epoch": 0.24704761904761904, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.112581968307495, "step": 7782 }, { "epoch": 0.24711111111111111, "grad_norm": 0.244140625, "learning_rate": 0.1, "loss": 2.115924596786499, "step": 7784 }, { "epoch": 0.24717460317460319, "grad_norm": 0.059814453125, "learning_rate": 0.1, "loss": 2.103855609893799, "step": 7786 }, { "epoch": 0.24723809523809523, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.0790719985961914, "step": 7788 }, { "epoch": 0.2473015873015873, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.1142492294311523, "step": 7790 }, { "epoch": 0.24736507936507937, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.140623092651367, "step": 7792 }, { "epoch": 0.24742857142857144, "grad_norm": 0.10205078125, "learning_rate": 0.1, "loss": 2.0839195251464844, "step": 7794 }, { "epoch": 0.24749206349206349, "grad_norm": 0.20703125, "learning_rate": 0.1, "loss": 2.085799217224121, "step": 7796 }, { "epoch": 0.24755555555555556, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.0974864959716797, "step": 7798 }, { "epoch": 0.24761904761904763, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.100862741470337, "step": 7800 }, { "epoch": 0.24768253968253967, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.123643636703491, "step": 7802 }, { "epoch": 0.24774603174603174, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.111666202545166, "step": 7804 }, { "epoch": 0.2478095238095238, "grad_norm": 0.06640625, "learning_rate": 0.1, "loss": 2.1051220893859863, "step": 7806 }, { "epoch": 0.24787301587301588, "grad_norm": 0.0693359375, "learning_rate": 0.1, "loss": 2.1013715267181396, "step": 7808 }, { "epoch": 0.24793650793650793, "grad_norm": 0.11181640625, "learning_rate": 0.1, "loss": 2.0939769744873047, "step": 7810 }, { "epoch": 0.248, "grad_norm": 0.07275390625, "learning_rate": 0.1, "loss": 2.0908074378967285, "step": 7812 }, { "epoch": 0.24806349206349207, "grad_norm": 0.1025390625, "learning_rate": 0.1, "loss": 2.0955634117126465, "step": 7814 }, { "epoch": 0.24812698412698414, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.0926597118377686, "step": 7816 }, { "epoch": 0.24819047619047618, "grad_norm": 0.095703125, "learning_rate": 0.1, "loss": 2.1247200965881348, "step": 7818 }, { "epoch": 0.24825396825396825, "grad_norm": 0.421875, "learning_rate": 0.1, "loss": 2.096296548843384, "step": 7820 }, { "epoch": 0.24831746031746033, "grad_norm": 0.396484375, "learning_rate": 0.1, "loss": 2.1253182888031006, "step": 7822 }, { "epoch": 0.24838095238095237, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.1058707237243652, "step": 7824 }, { "epoch": 0.24844444444444444, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.07863187789917, "step": 7826 }, { "epoch": 0.2485079365079365, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.123563766479492, "step": 7828 }, { "epoch": 0.24857142857142858, "grad_norm": 0.11669921875, "learning_rate": 0.1, "loss": 2.111006736755371, "step": 7830 }, { "epoch": 0.24863492063492063, "grad_norm": 0.10009765625, "learning_rate": 0.1, "loss": 2.103977680206299, "step": 7832 }, { "epoch": 0.2486984126984127, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.104381561279297, "step": 7834 }, { "epoch": 0.24876190476190477, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.113999843597412, "step": 7836 }, { "epoch": 0.24882539682539684, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.0946075916290283, "step": 7838 }, { "epoch": 0.24888888888888888, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.123718023300171, "step": 7840 }, { "epoch": 0.24895238095238095, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.1301300525665283, "step": 7842 }, { "epoch": 0.24901587301587302, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.1389126777648926, "step": 7844 }, { "epoch": 0.24907936507936507, "grad_norm": 0.1123046875, "learning_rate": 0.1, "loss": 2.138192892074585, "step": 7846 }, { "epoch": 0.24914285714285714, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.091545581817627, "step": 7848 }, { "epoch": 0.2492063492063492, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.090691089630127, "step": 7850 }, { "epoch": 0.24926984126984128, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.1429617404937744, "step": 7852 }, { "epoch": 0.24933333333333332, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.1256930828094482, "step": 7854 }, { "epoch": 0.2493968253968254, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.0978100299835205, "step": 7856 }, { "epoch": 0.24946031746031747, "grad_norm": 0.240234375, "learning_rate": 0.1, "loss": 2.1158103942871094, "step": 7858 }, { "epoch": 0.24952380952380954, "grad_norm": 0.236328125, "learning_rate": 0.1, "loss": 2.1170918941497803, "step": 7860 }, { "epoch": 0.24958730158730158, "grad_norm": 0.11181640625, "learning_rate": 0.1, "loss": 2.0898027420043945, "step": 7862 }, { "epoch": 0.24965079365079365, "grad_norm": 0.1650390625, "learning_rate": 0.1, "loss": 2.1037285327911377, "step": 7864 }, { "epoch": 0.24971428571428572, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.1031713485717773, "step": 7866 }, { "epoch": 0.24977777777777777, "grad_norm": 0.11572265625, "learning_rate": 0.1, "loss": 2.089454174041748, "step": 7868 }, { "epoch": 0.24984126984126984, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.1241745948791504, "step": 7870 }, { "epoch": 0.2499047619047619, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.114145278930664, "step": 7872 }, { "epoch": 0.24996825396825398, "grad_norm": 0.259765625, "learning_rate": 0.1, "loss": 2.1260359287261963, "step": 7874 }, { "epoch": 0.25003174603174605, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.1129047870635986, "step": 7876 }, { "epoch": 0.2500952380952381, "grad_norm": 0.0888671875, "learning_rate": 0.1, "loss": 2.105137586593628, "step": 7878 }, { "epoch": 0.25015873015873014, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.140394926071167, "step": 7880 }, { "epoch": 0.25022222222222223, "grad_norm": 0.07421875, "learning_rate": 0.1, "loss": 2.1253068447113037, "step": 7882 }, { "epoch": 0.2502857142857143, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.109440565109253, "step": 7884 }, { "epoch": 0.2503492063492064, "grad_norm": 0.0966796875, "learning_rate": 0.1, "loss": 2.124499797821045, "step": 7886 }, { "epoch": 0.2504126984126984, "grad_norm": 0.4921875, "learning_rate": 0.1, "loss": 2.1362109184265137, "step": 7888 }, { "epoch": 0.25047619047619046, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.1115052700042725, "step": 7890 }, { "epoch": 0.25053968253968256, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.1453135013580322, "step": 7892 }, { "epoch": 0.2506031746031746, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.1274404525756836, "step": 7894 }, { "epoch": 0.25066666666666665, "grad_norm": 0.1240234375, "learning_rate": 0.1, "loss": 2.1466567516326904, "step": 7896 }, { "epoch": 0.25073015873015875, "grad_norm": 0.08984375, "learning_rate": 0.1, "loss": 2.1065921783447266, "step": 7898 }, { "epoch": 0.2507936507936508, "grad_norm": 0.10107421875, "learning_rate": 0.1, "loss": 2.1110141277313232, "step": 7900 }, { "epoch": 0.25085714285714283, "grad_norm": 0.0634765625, "learning_rate": 0.1, "loss": 2.0745911598205566, "step": 7902 }, { "epoch": 0.25092063492063493, "grad_norm": 0.10595703125, "learning_rate": 0.1, "loss": 2.1433346271514893, "step": 7904 }, { "epoch": 0.250984126984127, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.0971295833587646, "step": 7906 }, { "epoch": 0.2510476190476191, "grad_norm": 0.111328125, "learning_rate": 0.1, "loss": 2.086148738861084, "step": 7908 }, { "epoch": 0.2511111111111111, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.095869779586792, "step": 7910 }, { "epoch": 0.25117460317460316, "grad_norm": 0.392578125, "learning_rate": 0.1, "loss": 2.1408135890960693, "step": 7912 }, { "epoch": 0.25123809523809526, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.1234843730926514, "step": 7914 }, { "epoch": 0.2513015873015873, "grad_norm": 0.1123046875, "learning_rate": 0.1, "loss": 2.100193500518799, "step": 7916 }, { "epoch": 0.25136507936507935, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.116718292236328, "step": 7918 }, { "epoch": 0.25142857142857145, "grad_norm": 0.318359375, "learning_rate": 0.1, "loss": 2.1161344051361084, "step": 7920 }, { "epoch": 0.2514920634920635, "grad_norm": 0.2353515625, "learning_rate": 0.1, "loss": 2.1228482723236084, "step": 7922 }, { "epoch": 0.25155555555555553, "grad_norm": 0.26953125, "learning_rate": 0.1, "loss": 2.101938486099243, "step": 7924 }, { "epoch": 0.25161904761904763, "grad_norm": 0.208984375, "learning_rate": 0.1, "loss": 2.125636339187622, "step": 7926 }, { "epoch": 0.2516825396825397, "grad_norm": 0.10009765625, "learning_rate": 0.1, "loss": 2.130795955657959, "step": 7928 }, { "epoch": 0.2517460317460318, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.141479253768921, "step": 7930 }, { "epoch": 0.2518095238095238, "grad_norm": 0.083984375, "learning_rate": 0.1, "loss": 2.133981943130493, "step": 7932 }, { "epoch": 0.25187301587301586, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.096903085708618, "step": 7934 }, { "epoch": 0.25193650793650796, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.101020574569702, "step": 7936 }, { "epoch": 0.252, "grad_norm": 0.0634765625, "learning_rate": 0.1, "loss": 2.149923324584961, "step": 7938 }, { "epoch": 0.25206349206349205, "grad_norm": 0.056640625, "learning_rate": 0.1, "loss": 2.1447412967681885, "step": 7940 }, { "epoch": 0.25212698412698414, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.1088924407958984, "step": 7942 }, { "epoch": 0.2521904761904762, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.143820285797119, "step": 7944 }, { "epoch": 0.25225396825396823, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.123006820678711, "step": 7946 }, { "epoch": 0.25231746031746033, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.115077257156372, "step": 7948 }, { "epoch": 0.2523809523809524, "grad_norm": 0.0703125, "learning_rate": 0.1, "loss": 2.132873296737671, "step": 7950 }, { "epoch": 0.25244444444444447, "grad_norm": 0.1220703125, "learning_rate": 0.1, "loss": 2.119615077972412, "step": 7952 }, { "epoch": 0.2525079365079365, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.0982370376586914, "step": 7954 }, { "epoch": 0.25257142857142856, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.0998048782348633, "step": 7956 }, { "epoch": 0.25263492063492066, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.1279993057250977, "step": 7958 }, { "epoch": 0.2526984126984127, "grad_norm": 0.1953125, "learning_rate": 0.1, "loss": 2.1212432384490967, "step": 7960 }, { "epoch": 0.25276190476190474, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.089418649673462, "step": 7962 }, { "epoch": 0.25282539682539684, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.1061694622039795, "step": 7964 }, { "epoch": 0.2528888888888889, "grad_norm": 0.07470703125, "learning_rate": 0.1, "loss": 2.08541202545166, "step": 7966 }, { "epoch": 0.25295238095238093, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.079415798187256, "step": 7968 }, { "epoch": 0.253015873015873, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.1145224571228027, "step": 7970 }, { "epoch": 0.25307936507936507, "grad_norm": 0.298828125, "learning_rate": 0.1, "loss": 2.102790355682373, "step": 7972 }, { "epoch": 0.25314285714285717, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.108755350112915, "step": 7974 }, { "epoch": 0.2532063492063492, "grad_norm": 0.0615234375, "learning_rate": 0.1, "loss": 2.074411392211914, "step": 7976 }, { "epoch": 0.25326984126984126, "grad_norm": 0.1123046875, "learning_rate": 0.1, "loss": 2.075920343399048, "step": 7978 }, { "epoch": 0.25333333333333335, "grad_norm": 0.330078125, "learning_rate": 0.1, "loss": 2.129059076309204, "step": 7980 }, { "epoch": 0.2533968253968254, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.081261396408081, "step": 7982 }, { "epoch": 0.25346031746031744, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.1025359630584717, "step": 7984 }, { "epoch": 0.25352380952380954, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.0795764923095703, "step": 7986 }, { "epoch": 0.2535873015873016, "grad_norm": 0.052001953125, "learning_rate": 0.1, "loss": 2.1056222915649414, "step": 7988 }, { "epoch": 0.2536507936507936, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.102372646331787, "step": 7990 }, { "epoch": 0.2537142857142857, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.1163816452026367, "step": 7992 }, { "epoch": 0.25377777777777777, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.0851519107818604, "step": 7994 }, { "epoch": 0.25384126984126987, "grad_norm": 0.11572265625, "learning_rate": 0.1, "loss": 2.0623624324798584, "step": 7996 }, { "epoch": 0.2539047619047619, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.096719741821289, "step": 7998 }, { "epoch": 0.25396825396825395, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.1160264015197754, "step": 8000 }, { "epoch": 0.25403174603174605, "grad_norm": 0.056884765625, "learning_rate": 0.1, "loss": 2.107738733291626, "step": 8002 }, { "epoch": 0.2540952380952381, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.0620903968811035, "step": 8004 }, { "epoch": 0.25415873015873014, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.1144566535949707, "step": 8006 }, { "epoch": 0.25422222222222224, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.0782837867736816, "step": 8008 }, { "epoch": 0.2542857142857143, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.103290557861328, "step": 8010 }, { "epoch": 0.2543492063492063, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.062232494354248, "step": 8012 }, { "epoch": 0.2544126984126984, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.070359706878662, "step": 8014 }, { "epoch": 0.25447619047619047, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.0512077808380127, "step": 8016 }, { "epoch": 0.25453968253968257, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.0814907550811768, "step": 8018 }, { "epoch": 0.2546031746031746, "grad_norm": 0.05419921875, "learning_rate": 0.1, "loss": 2.0676581859588623, "step": 8020 }, { "epoch": 0.25466666666666665, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.0573537349700928, "step": 8022 }, { "epoch": 0.25473015873015875, "grad_norm": 0.30078125, "learning_rate": 0.1, "loss": 2.1091148853302, "step": 8024 }, { "epoch": 0.2547936507936508, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.075312376022339, "step": 8026 }, { "epoch": 0.25485714285714284, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.0836408138275146, "step": 8028 }, { "epoch": 0.25492063492063494, "grad_norm": 0.2314453125, "learning_rate": 0.1, "loss": 2.0739047527313232, "step": 8030 }, { "epoch": 0.254984126984127, "grad_norm": 0.3203125, "learning_rate": 0.1, "loss": 2.074669361114502, "step": 8032 }, { "epoch": 0.255047619047619, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.0569419860839844, "step": 8034 }, { "epoch": 0.2551111111111111, "grad_norm": 0.06298828125, "learning_rate": 0.1, "loss": 2.087977886199951, "step": 8036 }, { "epoch": 0.25517460317460317, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.048699378967285, "step": 8038 }, { "epoch": 0.25523809523809526, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.076647996902466, "step": 8040 }, { "epoch": 0.2553015873015873, "grad_norm": 0.07861328125, "learning_rate": 0.1, "loss": 2.0756025314331055, "step": 8042 }, { "epoch": 0.25536507936507935, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.0606579780578613, "step": 8044 }, { "epoch": 0.25542857142857145, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.083803415298462, "step": 8046 }, { "epoch": 0.2554920634920635, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.0946483612060547, "step": 8048 }, { "epoch": 0.25555555555555554, "grad_norm": 0.10791015625, "learning_rate": 0.1, "loss": 2.078752279281616, "step": 8050 }, { "epoch": 0.25561904761904763, "grad_norm": 0.234375, "learning_rate": 0.1, "loss": 2.070709228515625, "step": 8052 }, { "epoch": 0.2556825396825397, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.0641446113586426, "step": 8054 }, { "epoch": 0.2557460317460317, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.0911717414855957, "step": 8056 }, { "epoch": 0.2558095238095238, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.1055798530578613, "step": 8058 }, { "epoch": 0.25587301587301586, "grad_norm": 0.0634765625, "learning_rate": 0.1, "loss": 2.09452748298645, "step": 8060 }, { "epoch": 0.25593650793650796, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.067446708679199, "step": 8062 }, { "epoch": 0.256, "grad_norm": 0.328125, "learning_rate": 0.1, "loss": 2.1161718368530273, "step": 8064 }, { "epoch": 0.25606349206349205, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.080165386199951, "step": 8066 }, { "epoch": 0.25612698412698415, "grad_norm": 0.07470703125, "learning_rate": 0.1, "loss": 2.0748403072357178, "step": 8068 }, { "epoch": 0.2561904761904762, "grad_norm": 0.244140625, "learning_rate": 0.1, "loss": 2.096667766571045, "step": 8070 }, { "epoch": 0.25625396825396823, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.082148790359497, "step": 8072 }, { "epoch": 0.25631746031746033, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.1058740615844727, "step": 8074 }, { "epoch": 0.2563809523809524, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.0829150676727295, "step": 8076 }, { "epoch": 0.2564444444444444, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.0994954109191895, "step": 8078 }, { "epoch": 0.2565079365079365, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.0763306617736816, "step": 8080 }, { "epoch": 0.25657142857142856, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.1045217514038086, "step": 8082 }, { "epoch": 0.25663492063492066, "grad_norm": 0.2060546875, "learning_rate": 0.1, "loss": 2.098149061203003, "step": 8084 }, { "epoch": 0.2566984126984127, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.1057240962982178, "step": 8086 }, { "epoch": 0.25676190476190475, "grad_norm": 0.057373046875, "learning_rate": 0.1, "loss": 2.07600998878479, "step": 8088 }, { "epoch": 0.25682539682539685, "grad_norm": 0.2412109375, "learning_rate": 0.1, "loss": 2.106583833694458, "step": 8090 }, { "epoch": 0.2568888888888889, "grad_norm": 0.31640625, "learning_rate": 0.1, "loss": 2.112182140350342, "step": 8092 }, { "epoch": 0.25695238095238093, "grad_norm": 0.072265625, "learning_rate": 0.1, "loss": 2.0814831256866455, "step": 8094 }, { "epoch": 0.25701587301587303, "grad_norm": 0.0732421875, "learning_rate": 0.1, "loss": 2.1069624423980713, "step": 8096 }, { "epoch": 0.2570793650793651, "grad_norm": 0.052978515625, "learning_rate": 0.1, "loss": 2.0665273666381836, "step": 8098 }, { "epoch": 0.2571428571428571, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.078469753265381, "step": 8100 }, { "epoch": 0.2572063492063492, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.1210246086120605, "step": 8102 }, { "epoch": 0.25726984126984126, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.088028907775879, "step": 8104 }, { "epoch": 0.25733333333333336, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.075634717941284, "step": 8106 }, { "epoch": 0.2573968253968254, "grad_norm": 0.1171875, "learning_rate": 0.1, "loss": 2.139772653579712, "step": 8108 }, { "epoch": 0.25746031746031744, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.106640338897705, "step": 8110 }, { "epoch": 0.25752380952380954, "grad_norm": 0.365234375, "learning_rate": 0.1, "loss": 2.132664203643799, "step": 8112 }, { "epoch": 0.2575873015873016, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.0857295989990234, "step": 8114 }, { "epoch": 0.25765079365079363, "grad_norm": 0.07177734375, "learning_rate": 0.1, "loss": 2.1078007221221924, "step": 8116 }, { "epoch": 0.25771428571428573, "grad_norm": 0.1171875, "learning_rate": 0.1, "loss": 2.0980477333068848, "step": 8118 }, { "epoch": 0.2577777777777778, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.083221197128296, "step": 8120 }, { "epoch": 0.2578412698412698, "grad_norm": 0.419921875, "learning_rate": 0.1, "loss": 2.098223924636841, "step": 8122 }, { "epoch": 0.2579047619047619, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.091862678527832, "step": 8124 }, { "epoch": 0.25796825396825396, "grad_norm": 0.2353515625, "learning_rate": 0.1, "loss": 2.104299783706665, "step": 8126 }, { "epoch": 0.25803174603174606, "grad_norm": 0.115234375, "learning_rate": 0.1, "loss": 2.0934860706329346, "step": 8128 }, { "epoch": 0.2580952380952381, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.1711854934692383, "step": 8130 }, { "epoch": 0.25815873015873014, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.094555377960205, "step": 8132 }, { "epoch": 0.25822222222222224, "grad_norm": 0.1103515625, "learning_rate": 0.1, "loss": 2.0874340534210205, "step": 8134 }, { "epoch": 0.2582857142857143, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.077636480331421, "step": 8136 }, { "epoch": 0.25834920634920633, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.0905163288116455, "step": 8138 }, { "epoch": 0.2584126984126984, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.0905184745788574, "step": 8140 }, { "epoch": 0.25847619047619047, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.0940182209014893, "step": 8142 }, { "epoch": 0.2585396825396825, "grad_norm": 0.083984375, "learning_rate": 0.1, "loss": 2.0914087295532227, "step": 8144 }, { "epoch": 0.2586031746031746, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.1215593814849854, "step": 8146 }, { "epoch": 0.25866666666666666, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.110151767730713, "step": 8148 }, { "epoch": 0.25873015873015875, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.079010248184204, "step": 8150 }, { "epoch": 0.2587936507936508, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.086280345916748, "step": 8152 }, { "epoch": 0.25885714285714284, "grad_norm": 0.208984375, "learning_rate": 0.1, "loss": 2.0587618350982666, "step": 8154 }, { "epoch": 0.25892063492063494, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.0931544303894043, "step": 8156 }, { "epoch": 0.258984126984127, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.098572254180908, "step": 8158 }, { "epoch": 0.259047619047619, "grad_norm": 0.22265625, "learning_rate": 0.1, "loss": 2.1195626258850098, "step": 8160 }, { "epoch": 0.2591111111111111, "grad_norm": 0.44140625, "learning_rate": 0.1, "loss": 2.113557815551758, "step": 8162 }, { "epoch": 0.25917460317460317, "grad_norm": 0.1201171875, "learning_rate": 0.1, "loss": 2.1057817935943604, "step": 8164 }, { "epoch": 0.2592380952380952, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.099402904510498, "step": 8166 }, { "epoch": 0.2593015873015873, "grad_norm": 0.08837890625, "learning_rate": 0.1, "loss": 2.118838310241699, "step": 8168 }, { "epoch": 0.25936507936507935, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.0910134315490723, "step": 8170 }, { "epoch": 0.25942857142857145, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.111870288848877, "step": 8172 }, { "epoch": 0.2594920634920635, "grad_norm": 0.2412109375, "learning_rate": 0.1, "loss": 2.0961105823516846, "step": 8174 }, { "epoch": 0.25955555555555554, "grad_norm": 0.380859375, "learning_rate": 0.1, "loss": 2.1171720027923584, "step": 8176 }, { "epoch": 0.25961904761904764, "grad_norm": 0.05517578125, "learning_rate": 0.1, "loss": 2.0985074043273926, "step": 8178 }, { "epoch": 0.2596825396825397, "grad_norm": 0.040283203125, "learning_rate": 0.1, "loss": 2.119133710861206, "step": 8180 }, { "epoch": 0.2597460317460317, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.076338529586792, "step": 8182 }, { "epoch": 0.2598095238095238, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.113738536834717, "step": 8184 }, { "epoch": 0.25987301587301587, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.109626293182373, "step": 8186 }, { "epoch": 0.2599365079365079, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.1097044944763184, "step": 8188 }, { "epoch": 0.26, "grad_norm": 0.060791015625, "learning_rate": 0.1, "loss": 2.1278395652770996, "step": 8190 }, { "epoch": 0.26006349206349205, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.1287569999694824, "step": 8192 }, { "epoch": 0.26012698412698415, "grad_norm": 0.11181640625, "learning_rate": 0.1, "loss": 2.0843346118927, "step": 8194 }, { "epoch": 0.2601904761904762, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.0900919437408447, "step": 8196 }, { "epoch": 0.26025396825396824, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.089617967605591, "step": 8198 }, { "epoch": 0.26031746031746034, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.0922770500183105, "step": 8200 }, { "epoch": 0.2603809523809524, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 2.0996437072753906, "step": 8202 }, { "epoch": 0.2604444444444444, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.0791726112365723, "step": 8204 }, { "epoch": 0.2605079365079365, "grad_norm": 0.10693359375, "learning_rate": 0.1, "loss": 2.109050750732422, "step": 8206 }, { "epoch": 0.26057142857142856, "grad_norm": 0.12255859375, "learning_rate": 0.1, "loss": 2.0832743644714355, "step": 8208 }, { "epoch": 0.2606349206349206, "grad_norm": 0.051513671875, "learning_rate": 0.1, "loss": 2.1103382110595703, "step": 8210 }, { "epoch": 0.2606984126984127, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.109618663787842, "step": 8212 }, { "epoch": 0.26076190476190475, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.153921365737915, "step": 8214 }, { "epoch": 0.26082539682539685, "grad_norm": 0.20703125, "learning_rate": 0.1, "loss": 2.1183905601501465, "step": 8216 }, { "epoch": 0.2608888888888889, "grad_norm": 0.302734375, "learning_rate": 0.1, "loss": 2.0681939125061035, "step": 8218 }, { "epoch": 0.26095238095238094, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.0965750217437744, "step": 8220 }, { "epoch": 0.26101587301587303, "grad_norm": 0.11376953125, "learning_rate": 0.1, "loss": 2.116101026535034, "step": 8222 }, { "epoch": 0.2610793650793651, "grad_norm": 0.06396484375, "learning_rate": 0.1, "loss": 2.1299855709075928, "step": 8224 }, { "epoch": 0.2611428571428571, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.1272051334381104, "step": 8226 }, { "epoch": 0.2612063492063492, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.128498077392578, "step": 8228 }, { "epoch": 0.26126984126984126, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.1338417530059814, "step": 8230 }, { "epoch": 0.2613333333333333, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.1035356521606445, "step": 8232 }, { "epoch": 0.2613968253968254, "grad_norm": 0.32421875, "learning_rate": 0.1, "loss": 2.115013360977173, "step": 8234 }, { "epoch": 0.26146031746031745, "grad_norm": 0.55859375, "learning_rate": 0.1, "loss": 2.1498680114746094, "step": 8236 }, { "epoch": 0.26152380952380955, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.1168999671936035, "step": 8238 }, { "epoch": 0.2615873015873016, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.1404194831848145, "step": 8240 }, { "epoch": 0.26165079365079363, "grad_norm": 0.07470703125, "learning_rate": 0.1, "loss": 2.118223190307617, "step": 8242 }, { "epoch": 0.26171428571428573, "grad_norm": 0.072265625, "learning_rate": 0.1, "loss": 2.131054639816284, "step": 8244 }, { "epoch": 0.2617777777777778, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.1190361976623535, "step": 8246 }, { "epoch": 0.2618412698412698, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.1467514038085938, "step": 8248 }, { "epoch": 0.2619047619047619, "grad_norm": 0.059326171875, "learning_rate": 0.1, "loss": 2.091456651687622, "step": 8250 }, { "epoch": 0.26196825396825396, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.1232292652130127, "step": 8252 }, { "epoch": 0.262031746031746, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.1226489543914795, "step": 8254 }, { "epoch": 0.2620952380952381, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.1117196083068848, "step": 8256 }, { "epoch": 0.26215873015873015, "grad_norm": 0.30078125, "learning_rate": 0.1, "loss": 2.1011109352111816, "step": 8258 }, { "epoch": 0.26222222222222225, "grad_norm": 0.1201171875, "learning_rate": 0.1, "loss": 2.1351263523101807, "step": 8260 }, { "epoch": 0.2622857142857143, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.1273605823516846, "step": 8262 }, { "epoch": 0.26234920634920633, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.1131889820098877, "step": 8264 }, { "epoch": 0.26241269841269843, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.122616767883301, "step": 8266 }, { "epoch": 0.2624761904761905, "grad_norm": 0.0712890625, "learning_rate": 0.1, "loss": 2.122109889984131, "step": 8268 }, { "epoch": 0.2625396825396825, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.1247549057006836, "step": 8270 }, { "epoch": 0.2626031746031746, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.1268322467803955, "step": 8272 }, { "epoch": 0.26266666666666666, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.1431219577789307, "step": 8274 }, { "epoch": 0.2627301587301587, "grad_norm": 0.10400390625, "learning_rate": 0.1, "loss": 2.123469591140747, "step": 8276 }, { "epoch": 0.2627936507936508, "grad_norm": 0.322265625, "learning_rate": 0.1, "loss": 2.155906915664673, "step": 8278 }, { "epoch": 0.26285714285714284, "grad_norm": 0.38671875, "learning_rate": 0.1, "loss": 2.1168787479400635, "step": 8280 }, { "epoch": 0.26292063492063494, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.1293318271636963, "step": 8282 }, { "epoch": 0.262984126984127, "grad_norm": 0.2294921875, "learning_rate": 0.1, "loss": 2.1414129734039307, "step": 8284 }, { "epoch": 0.26304761904761903, "grad_norm": 0.0849609375, "learning_rate": 0.1, "loss": 2.111966133117676, "step": 8286 }, { "epoch": 0.26311111111111113, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.117311716079712, "step": 8288 }, { "epoch": 0.26317460317460317, "grad_norm": 0.054443359375, "learning_rate": 0.1, "loss": 2.1452598571777344, "step": 8290 }, { "epoch": 0.2632380952380952, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.1221444606781006, "step": 8292 }, { "epoch": 0.2633015873015873, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.1219358444213867, "step": 8294 }, { "epoch": 0.26336507936507936, "grad_norm": 0.1201171875, "learning_rate": 0.1, "loss": 2.1338047981262207, "step": 8296 }, { "epoch": 0.2634285714285714, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.144634246826172, "step": 8298 }, { "epoch": 0.2634920634920635, "grad_norm": 0.1650390625, "learning_rate": 0.1, "loss": 2.108712911605835, "step": 8300 }, { "epoch": 0.26355555555555554, "grad_norm": 0.2373046875, "learning_rate": 0.1, "loss": 2.120450735092163, "step": 8302 }, { "epoch": 0.26361904761904764, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.1316046714782715, "step": 8304 }, { "epoch": 0.2636825396825397, "grad_norm": 0.09375, "learning_rate": 0.1, "loss": 2.089925765991211, "step": 8306 }, { "epoch": 0.26374603174603173, "grad_norm": 0.2060546875, "learning_rate": 0.1, "loss": 2.1459388732910156, "step": 8308 }, { "epoch": 0.2638095238095238, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.1347081661224365, "step": 8310 }, { "epoch": 0.26387301587301587, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.092588424682617, "step": 8312 }, { "epoch": 0.2639365079365079, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.1310856342315674, "step": 8314 }, { "epoch": 0.264, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.1174092292785645, "step": 8316 }, { "epoch": 0.26406349206349206, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.0907092094421387, "step": 8318 }, { "epoch": 0.26412698412698415, "grad_norm": 0.064453125, "learning_rate": 0.1, "loss": 2.116969347000122, "step": 8320 }, { "epoch": 0.2641904761904762, "grad_norm": 0.08203125, "learning_rate": 0.1, "loss": 2.123289108276367, "step": 8322 }, { "epoch": 0.26425396825396824, "grad_norm": 0.294921875, "learning_rate": 0.1, "loss": 2.0935895442962646, "step": 8324 }, { "epoch": 0.26431746031746034, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.0774943828582764, "step": 8326 }, { "epoch": 0.2643809523809524, "grad_norm": 0.06640625, "learning_rate": 0.1, "loss": 2.12845778465271, "step": 8328 }, { "epoch": 0.2644444444444444, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.1092987060546875, "step": 8330 }, { "epoch": 0.2645079365079365, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.127121686935425, "step": 8332 }, { "epoch": 0.26457142857142857, "grad_norm": 0.39453125, "learning_rate": 0.1, "loss": 2.1184210777282715, "step": 8334 }, { "epoch": 0.2646349206349206, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.127631425857544, "step": 8336 }, { "epoch": 0.2646984126984127, "grad_norm": 0.2353515625, "learning_rate": 0.1, "loss": 2.1090471744537354, "step": 8338 }, { "epoch": 0.26476190476190475, "grad_norm": 0.10400390625, "learning_rate": 0.1, "loss": 2.110409736633301, "step": 8340 }, { "epoch": 0.26482539682539685, "grad_norm": 0.2236328125, "learning_rate": 0.1, "loss": 2.1268885135650635, "step": 8342 }, { "epoch": 0.2648888888888889, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.099311113357544, "step": 8344 }, { "epoch": 0.26495238095238094, "grad_norm": 0.1025390625, "learning_rate": 0.1, "loss": 2.1144416332244873, "step": 8346 }, { "epoch": 0.26501587301587304, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.1316823959350586, "step": 8348 }, { "epoch": 0.2650793650793651, "grad_norm": 0.10986328125, "learning_rate": 0.1, "loss": 2.1064064502716064, "step": 8350 }, { "epoch": 0.2651428571428571, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.1045873165130615, "step": 8352 }, { "epoch": 0.2652063492063492, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.1169402599334717, "step": 8354 }, { "epoch": 0.26526984126984127, "grad_norm": 0.07080078125, "learning_rate": 0.1, "loss": 2.109435796737671, "step": 8356 }, { "epoch": 0.2653333333333333, "grad_norm": 0.06689453125, "learning_rate": 0.1, "loss": 2.094067335128784, "step": 8358 }, { "epoch": 0.2653968253968254, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.1232032775878906, "step": 8360 }, { "epoch": 0.26546031746031745, "grad_norm": 0.349609375, "learning_rate": 0.1, "loss": 2.076899290084839, "step": 8362 }, { "epoch": 0.26552380952380955, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.0631186962127686, "step": 8364 }, { "epoch": 0.2655873015873016, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.129593849182129, "step": 8366 }, { "epoch": 0.26565079365079364, "grad_norm": 0.21484375, "learning_rate": 0.1, "loss": 2.106175422668457, "step": 8368 }, { "epoch": 0.26571428571428574, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.119671583175659, "step": 8370 }, { "epoch": 0.2657777777777778, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.100867986679077, "step": 8372 }, { "epoch": 0.2658412698412698, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.130281448364258, "step": 8374 }, { "epoch": 0.2659047619047619, "grad_norm": 0.232421875, "learning_rate": 0.1, "loss": 2.13114595413208, "step": 8376 }, { "epoch": 0.26596825396825396, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.10205078125, "step": 8378 }, { "epoch": 0.266031746031746, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.1338860988616943, "step": 8380 }, { "epoch": 0.2660952380952381, "grad_norm": 0.1953125, "learning_rate": 0.1, "loss": 2.10996675491333, "step": 8382 }, { "epoch": 0.26615873015873015, "grad_norm": 0.09814453125, "learning_rate": 0.1, "loss": 2.1046416759490967, "step": 8384 }, { "epoch": 0.26622222222222225, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.098137140274048, "step": 8386 }, { "epoch": 0.2662857142857143, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.0943901538848877, "step": 8388 }, { "epoch": 0.26634920634920634, "grad_norm": 0.23046875, "learning_rate": 0.1, "loss": 2.122483253479004, "step": 8390 }, { "epoch": 0.26641269841269843, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.1463913917541504, "step": 8392 }, { "epoch": 0.2664761904761905, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.1061367988586426, "step": 8394 }, { "epoch": 0.2665396825396825, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.118013620376587, "step": 8396 }, { "epoch": 0.2666031746031746, "grad_norm": 0.0546875, "learning_rate": 0.1, "loss": 2.123303174972534, "step": 8398 }, { "epoch": 0.26666666666666666, "grad_norm": 0.09375, "learning_rate": 0.1, "loss": 2.1308352947235107, "step": 8400 }, { "epoch": 0.2667301587301587, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.111340284347534, "step": 8402 }, { "epoch": 0.2667936507936508, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.114323854446411, "step": 8404 }, { "epoch": 0.26685714285714285, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.1177074909210205, "step": 8406 }, { "epoch": 0.26692063492063495, "grad_norm": 0.337890625, "learning_rate": 0.1, "loss": 2.139169692993164, "step": 8408 }, { "epoch": 0.266984126984127, "grad_norm": 0.08642578125, "learning_rate": 0.1, "loss": 2.147883176803589, "step": 8410 }, { "epoch": 0.26704761904761903, "grad_norm": 0.248046875, "learning_rate": 0.1, "loss": 2.138007640838623, "step": 8412 }, { "epoch": 0.26711111111111113, "grad_norm": 0.11767578125, "learning_rate": 0.1, "loss": 2.117086887359619, "step": 8414 }, { "epoch": 0.2671746031746032, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.1543028354644775, "step": 8416 }, { "epoch": 0.2672380952380952, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.1271450519561768, "step": 8418 }, { "epoch": 0.2673015873015873, "grad_norm": 0.07470703125, "learning_rate": 0.1, "loss": 2.1434197425842285, "step": 8420 }, { "epoch": 0.26736507936507936, "grad_norm": 0.11083984375, "learning_rate": 0.1, "loss": 2.1558046340942383, "step": 8422 }, { "epoch": 0.2674285714285714, "grad_norm": 0.1650390625, "learning_rate": 0.1, "loss": 2.140834331512451, "step": 8424 }, { "epoch": 0.2674920634920635, "grad_norm": 0.498046875, "learning_rate": 0.1, "loss": 2.1117613315582275, "step": 8426 }, { "epoch": 0.26755555555555555, "grad_norm": 0.234375, "learning_rate": 0.1, "loss": 2.1366260051727295, "step": 8428 }, { "epoch": 0.26761904761904765, "grad_norm": 0.1171875, "learning_rate": 0.1, "loss": 2.1364035606384277, "step": 8430 }, { "epoch": 0.2676825396825397, "grad_norm": 0.1162109375, "learning_rate": 0.1, "loss": 2.153635263442993, "step": 8432 }, { "epoch": 0.26774603174603173, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.1148746013641357, "step": 8434 }, { "epoch": 0.26780952380952383, "grad_norm": 0.10693359375, "learning_rate": 0.1, "loss": 2.1311771869659424, "step": 8436 }, { "epoch": 0.2678730158730159, "grad_norm": 0.062255859375, "learning_rate": 0.1, "loss": 2.10973858833313, "step": 8438 }, { "epoch": 0.2679365079365079, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.1464195251464844, "step": 8440 }, { "epoch": 0.268, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.1309762001037598, "step": 8442 }, { "epoch": 0.26806349206349206, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.1188669204711914, "step": 8444 }, { "epoch": 0.2681269841269841, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.1132168769836426, "step": 8446 }, { "epoch": 0.2681904761904762, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.160536527633667, "step": 8448 }, { "epoch": 0.26825396825396824, "grad_norm": 0.353515625, "learning_rate": 0.1, "loss": 2.1440067291259766, "step": 8450 }, { "epoch": 0.26831746031746034, "grad_norm": 0.08935546875, "learning_rate": 0.1, "loss": 2.142578125, "step": 8452 }, { "epoch": 0.2683809523809524, "grad_norm": 0.06591796875, "learning_rate": 0.1, "loss": 2.1475629806518555, "step": 8454 }, { "epoch": 0.26844444444444443, "grad_norm": 0.06298828125, "learning_rate": 0.1, "loss": 2.1245362758636475, "step": 8456 }, { "epoch": 0.26850793650793653, "grad_norm": 0.1982421875, "learning_rate": 0.1, "loss": 2.12142014503479, "step": 8458 }, { "epoch": 0.26857142857142857, "grad_norm": 0.240234375, "learning_rate": 0.1, "loss": 2.124265670776367, "step": 8460 }, { "epoch": 0.2686349206349206, "grad_norm": 0.306640625, "learning_rate": 0.1, "loss": 2.133720636367798, "step": 8462 }, { "epoch": 0.2686984126984127, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.096112012863159, "step": 8464 }, { "epoch": 0.26876190476190476, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.136575937271118, "step": 8466 }, { "epoch": 0.2688253968253968, "grad_norm": 0.078125, "learning_rate": 0.1, "loss": 2.136136770248413, "step": 8468 }, { "epoch": 0.2688888888888889, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.1376519203186035, "step": 8470 }, { "epoch": 0.26895238095238094, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.115398406982422, "step": 8472 }, { "epoch": 0.26901587301587304, "grad_norm": 0.1416015625, "learning_rate": 0.1, "loss": 2.1332530975341797, "step": 8474 }, { "epoch": 0.2690793650793651, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.157287120819092, "step": 8476 }, { "epoch": 0.26914285714285713, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.126091241836548, "step": 8478 }, { "epoch": 0.2692063492063492, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.1376101970672607, "step": 8480 }, { "epoch": 0.26926984126984127, "grad_norm": 0.20703125, "learning_rate": 0.1, "loss": 2.1525826454162598, "step": 8482 }, { "epoch": 0.2693333333333333, "grad_norm": 0.11083984375, "learning_rate": 0.1, "loss": 2.1410980224609375, "step": 8484 }, { "epoch": 0.2693968253968254, "grad_norm": 0.054443359375, "learning_rate": 0.1, "loss": 2.1469180583953857, "step": 8486 }, { "epoch": 0.26946031746031746, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.104678153991699, "step": 8488 }, { "epoch": 0.2695238095238095, "grad_norm": 0.046630859375, "learning_rate": 0.1, "loss": 2.0924463272094727, "step": 8490 }, { "epoch": 0.2695873015873016, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.116642951965332, "step": 8492 }, { "epoch": 0.26965079365079364, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.1545088291168213, "step": 8494 }, { "epoch": 0.26971428571428574, "grad_norm": 0.408203125, "learning_rate": 0.1, "loss": 2.150811195373535, "step": 8496 }, { "epoch": 0.2697777777777778, "grad_norm": 0.625, "learning_rate": 0.1, "loss": 2.116386890411377, "step": 8498 }, { "epoch": 0.2698412698412698, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.1447179317474365, "step": 8500 }, { "epoch": 0.2699047619047619, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.1437106132507324, "step": 8502 }, { "epoch": 0.26996825396825397, "grad_norm": 0.078125, "learning_rate": 0.1, "loss": 2.1249139308929443, "step": 8504 }, { "epoch": 0.270031746031746, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.1368446350097656, "step": 8506 }, { "epoch": 0.2700952380952381, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.1275832653045654, "step": 8508 }, { "epoch": 0.27015873015873015, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.1269614696502686, "step": 8510 }, { "epoch": 0.2702222222222222, "grad_norm": 0.05224609375, "learning_rate": 0.1, "loss": 2.100285291671753, "step": 8512 }, { "epoch": 0.2702857142857143, "grad_norm": 0.051025390625, "learning_rate": 0.1, "loss": 2.1286253929138184, "step": 8514 }, { "epoch": 0.27034920634920634, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.1246466636657715, "step": 8516 }, { "epoch": 0.27041269841269844, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.1177847385406494, "step": 8518 }, { "epoch": 0.2704761904761905, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.1174726486206055, "step": 8520 }, { "epoch": 0.2705396825396825, "grad_norm": 0.07080078125, "learning_rate": 0.1, "loss": 2.15252685546875, "step": 8522 }, { "epoch": 0.2706031746031746, "grad_norm": 0.11279296875, "learning_rate": 0.1, "loss": 2.1377549171447754, "step": 8524 }, { "epoch": 0.27066666666666667, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.1440718173980713, "step": 8526 }, { "epoch": 0.2707301587301587, "grad_norm": 0.1103515625, "learning_rate": 0.1, "loss": 2.1254405975341797, "step": 8528 }, { "epoch": 0.2707936507936508, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.155799150466919, "step": 8530 }, { "epoch": 0.27085714285714285, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.1237502098083496, "step": 8532 }, { "epoch": 0.2709206349206349, "grad_norm": 0.0888671875, "learning_rate": 0.1, "loss": 2.134565591812134, "step": 8534 }, { "epoch": 0.270984126984127, "grad_norm": 0.060791015625, "learning_rate": 0.1, "loss": 2.1404709815979004, "step": 8536 }, { "epoch": 0.27104761904761904, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.148927688598633, "step": 8538 }, { "epoch": 0.27111111111111114, "grad_norm": 0.337890625, "learning_rate": 0.1, "loss": 2.13960599899292, "step": 8540 }, { "epoch": 0.2711746031746032, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.126688241958618, "step": 8542 }, { "epoch": 0.2712380952380952, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.1396687030792236, "step": 8544 }, { "epoch": 0.2713015873015873, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.12801194190979, "step": 8546 }, { "epoch": 0.27136507936507936, "grad_norm": 0.05712890625, "learning_rate": 0.1, "loss": 2.135812520980835, "step": 8548 }, { "epoch": 0.2714285714285714, "grad_norm": 0.1171875, "learning_rate": 0.1, "loss": 2.1147050857543945, "step": 8550 }, { "epoch": 0.2714920634920635, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.119969367980957, "step": 8552 }, { "epoch": 0.27155555555555555, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.1262426376342773, "step": 8554 }, { "epoch": 0.2716190476190476, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.139942169189453, "step": 8556 }, { "epoch": 0.2716825396825397, "grad_norm": 0.1943359375, "learning_rate": 0.1, "loss": 2.097947835922241, "step": 8558 }, { "epoch": 0.27174603174603174, "grad_norm": 0.384765625, "learning_rate": 0.1, "loss": 2.1339030265808105, "step": 8560 }, { "epoch": 0.27180952380952383, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.1460084915161133, "step": 8562 }, { "epoch": 0.2718730158730159, "grad_norm": 0.1943359375, "learning_rate": 0.1, "loss": 2.112062931060791, "step": 8564 }, { "epoch": 0.2719365079365079, "grad_norm": 0.103515625, "learning_rate": 0.1, "loss": 2.1121580600738525, "step": 8566 }, { "epoch": 0.272, "grad_norm": 0.1943359375, "learning_rate": 0.1, "loss": 2.1036150455474854, "step": 8568 }, { "epoch": 0.27206349206349206, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.134664535522461, "step": 8570 }, { "epoch": 0.2721269841269841, "grad_norm": 0.10107421875, "learning_rate": 0.1, "loss": 2.1052119731903076, "step": 8572 }, { "epoch": 0.2721904761904762, "grad_norm": 0.06640625, "learning_rate": 0.1, "loss": 2.14681077003479, "step": 8574 }, { "epoch": 0.27225396825396825, "grad_norm": 0.068359375, "learning_rate": 0.1, "loss": 2.117945671081543, "step": 8576 }, { "epoch": 0.2723174603174603, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.1313278675079346, "step": 8578 }, { "epoch": 0.2723809523809524, "grad_norm": 0.11279296875, "learning_rate": 0.1, "loss": 2.1041481494903564, "step": 8580 }, { "epoch": 0.27244444444444443, "grad_norm": 0.1015625, "learning_rate": 0.1, "loss": 2.128117561340332, "step": 8582 }, { "epoch": 0.27250793650793653, "grad_norm": 0.0439453125, "learning_rate": 0.1, "loss": 2.135561466217041, "step": 8584 }, { "epoch": 0.2725714285714286, "grad_norm": 0.048583984375, "learning_rate": 0.1, "loss": 2.131937026977539, "step": 8586 }, { "epoch": 0.2726349206349206, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.0996110439300537, "step": 8588 }, { "epoch": 0.2726984126984127, "grad_norm": 0.34765625, "learning_rate": 0.1, "loss": 2.1476497650146484, "step": 8590 }, { "epoch": 0.27276190476190476, "grad_norm": 0.2490234375, "learning_rate": 0.1, "loss": 2.1423418521881104, "step": 8592 }, { "epoch": 0.2728253968253968, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.143872022628784, "step": 8594 }, { "epoch": 0.2728888888888889, "grad_norm": 0.0546875, "learning_rate": 0.1, "loss": 2.1375913619995117, "step": 8596 }, { "epoch": 0.27295238095238095, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.1452834606170654, "step": 8598 }, { "epoch": 0.273015873015873, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.078801155090332, "step": 8600 }, { "epoch": 0.2730793650793651, "grad_norm": 0.09619140625, "learning_rate": 0.1, "loss": 2.137643575668335, "step": 8602 }, { "epoch": 0.27314285714285713, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.100147247314453, "step": 8604 }, { "epoch": 0.27320634920634923, "grad_norm": 0.10107421875, "learning_rate": 0.1, "loss": 2.1014978885650635, "step": 8606 }, { "epoch": 0.2732698412698413, "grad_norm": 0.10107421875, "learning_rate": 0.1, "loss": 2.104013442993164, "step": 8608 }, { "epoch": 0.2733333333333333, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.097313165664673, "step": 8610 }, { "epoch": 0.2733968253968254, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.13362979888916, "step": 8612 }, { "epoch": 0.27346031746031746, "grad_norm": 0.31640625, "learning_rate": 0.1, "loss": 2.115877151489258, "step": 8614 }, { "epoch": 0.2735238095238095, "grad_norm": 0.11572265625, "learning_rate": 0.1, "loss": 2.145993947982788, "step": 8616 }, { "epoch": 0.2735873015873016, "grad_norm": 0.0654296875, "learning_rate": 0.1, "loss": 2.1195523738861084, "step": 8618 }, { "epoch": 0.27365079365079364, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.132167339324951, "step": 8620 }, { "epoch": 0.2737142857142857, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.1215713024139404, "step": 8622 }, { "epoch": 0.2737777777777778, "grad_norm": 0.33203125, "learning_rate": 0.1, "loss": 2.1349964141845703, "step": 8624 }, { "epoch": 0.27384126984126983, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.093269109725952, "step": 8626 }, { "epoch": 0.27390476190476193, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.1247639656066895, "step": 8628 }, { "epoch": 0.27396825396825397, "grad_norm": 0.064453125, "learning_rate": 0.1, "loss": 2.1117467880249023, "step": 8630 }, { "epoch": 0.274031746031746, "grad_norm": 0.09375, "learning_rate": 0.1, "loss": 2.1069135665893555, "step": 8632 }, { "epoch": 0.2740952380952381, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.1482911109924316, "step": 8634 }, { "epoch": 0.27415873015873016, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.105473518371582, "step": 8636 }, { "epoch": 0.2742222222222222, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.1212382316589355, "step": 8638 }, { "epoch": 0.2742857142857143, "grad_norm": 0.095703125, "learning_rate": 0.1, "loss": 2.101505994796753, "step": 8640 }, { "epoch": 0.27434920634920634, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.102080821990967, "step": 8642 }, { "epoch": 0.2744126984126984, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.103355884552002, "step": 8644 }, { "epoch": 0.2744761904761905, "grad_norm": 0.04931640625, "learning_rate": 0.1, "loss": 2.1178579330444336, "step": 8646 }, { "epoch": 0.27453968253968253, "grad_norm": 0.05615234375, "learning_rate": 0.1, "loss": 2.12017822265625, "step": 8648 }, { "epoch": 0.2746031746031746, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.0837507247924805, "step": 8650 }, { "epoch": 0.27466666666666667, "grad_norm": 0.37890625, "learning_rate": 0.1, "loss": 2.1261327266693115, "step": 8652 }, { "epoch": 0.2747301587301587, "grad_norm": 0.37109375, "learning_rate": 0.1, "loss": 2.1251444816589355, "step": 8654 }, { "epoch": 0.2747936507936508, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.098879337310791, "step": 8656 }, { "epoch": 0.27485714285714286, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.129762649536133, "step": 8658 }, { "epoch": 0.2749206349206349, "grad_norm": 0.1142578125, "learning_rate": 0.1, "loss": 2.1341512203216553, "step": 8660 }, { "epoch": 0.274984126984127, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.11061954498291, "step": 8662 }, { "epoch": 0.27504761904761904, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.134126901626587, "step": 8664 }, { "epoch": 0.2751111111111111, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.108114719390869, "step": 8666 }, { "epoch": 0.2751746031746032, "grad_norm": 0.0888671875, "learning_rate": 0.1, "loss": 2.1089553833007812, "step": 8668 }, { "epoch": 0.2752380952380952, "grad_norm": 0.2421875, "learning_rate": 0.1, "loss": 2.106834888458252, "step": 8670 }, { "epoch": 0.2753015873015873, "grad_norm": 0.224609375, "learning_rate": 0.1, "loss": 2.125664710998535, "step": 8672 }, { "epoch": 0.27536507936507937, "grad_norm": 0.1123046875, "learning_rate": 0.1, "loss": 2.1441173553466797, "step": 8674 }, { "epoch": 0.2754285714285714, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.1069390773773193, "step": 8676 }, { "epoch": 0.2754920634920635, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.115922451019287, "step": 8678 }, { "epoch": 0.27555555555555555, "grad_norm": 0.2470703125, "learning_rate": 0.1, "loss": 2.1389780044555664, "step": 8680 }, { "epoch": 0.2756190476190476, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.141141414642334, "step": 8682 }, { "epoch": 0.2756825396825397, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.099609851837158, "step": 8684 }, { "epoch": 0.27574603174603174, "grad_norm": 0.10205078125, "learning_rate": 0.1, "loss": 2.119847059249878, "step": 8686 }, { "epoch": 0.2758095238095238, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.0786561965942383, "step": 8688 }, { "epoch": 0.2758730158730159, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.097207546234131, "step": 8690 }, { "epoch": 0.2759365079365079, "grad_norm": 0.1240234375, "learning_rate": 0.1, "loss": 2.1100316047668457, "step": 8692 }, { "epoch": 0.276, "grad_norm": 0.2314453125, "learning_rate": 0.1, "loss": 2.121699333190918, "step": 8694 }, { "epoch": 0.27606349206349207, "grad_norm": 0.359375, "learning_rate": 0.1, "loss": 2.0803420543670654, "step": 8696 }, { "epoch": 0.2761269841269841, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.0993804931640625, "step": 8698 }, { "epoch": 0.2761904761904762, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.095191478729248, "step": 8700 }, { "epoch": 0.27625396825396825, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.101367950439453, "step": 8702 }, { "epoch": 0.2763174603174603, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.095370054244995, "step": 8704 }, { "epoch": 0.2763809523809524, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.094785213470459, "step": 8706 }, { "epoch": 0.27644444444444444, "grad_norm": 0.05322265625, "learning_rate": 0.1, "loss": 2.103748083114624, "step": 8708 }, { "epoch": 0.2765079365079365, "grad_norm": 0.11279296875, "learning_rate": 0.1, "loss": 2.1015448570251465, "step": 8710 }, { "epoch": 0.2765714285714286, "grad_norm": 0.05712890625, "learning_rate": 0.1, "loss": 2.089344024658203, "step": 8712 }, { "epoch": 0.2766349206349206, "grad_norm": 0.055908203125, "learning_rate": 0.1, "loss": 2.0754406452178955, "step": 8714 }, { "epoch": 0.2766984126984127, "grad_norm": 0.08544921875, "learning_rate": 0.1, "loss": 2.0874297618865967, "step": 8716 }, { "epoch": 0.27676190476190476, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.0546696186065674, "step": 8718 }, { "epoch": 0.2768253968253968, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.1206765174865723, "step": 8720 }, { "epoch": 0.2768888888888889, "grad_norm": 0.2216796875, "learning_rate": 0.1, "loss": 2.120166540145874, "step": 8722 }, { "epoch": 0.27695238095238095, "grad_norm": 0.2451171875, "learning_rate": 0.1, "loss": 2.082396984100342, "step": 8724 }, { "epoch": 0.277015873015873, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.1015217304229736, "step": 8726 }, { "epoch": 0.2770793650793651, "grad_norm": 0.053955078125, "learning_rate": 0.1, "loss": 2.0530738830566406, "step": 8728 }, { "epoch": 0.27714285714285714, "grad_norm": 0.2001953125, "learning_rate": 0.1, "loss": 2.0722157955169678, "step": 8730 }, { "epoch": 0.27720634920634923, "grad_norm": 0.10986328125, "learning_rate": 0.1, "loss": 2.087872266769409, "step": 8732 }, { "epoch": 0.2772698412698413, "grad_norm": 0.1025390625, "learning_rate": 0.1, "loss": 2.1183462142944336, "step": 8734 }, { "epoch": 0.2773333333333333, "grad_norm": 0.11279296875, "learning_rate": 0.1, "loss": 2.1100778579711914, "step": 8736 }, { "epoch": 0.2773968253968254, "grad_norm": 0.115234375, "learning_rate": 0.1, "loss": 2.052661895751953, "step": 8738 }, { "epoch": 0.27746031746031746, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.0922820568084717, "step": 8740 }, { "epoch": 0.2775238095238095, "grad_norm": 0.345703125, "learning_rate": 0.1, "loss": 2.100325584411621, "step": 8742 }, { "epoch": 0.2775873015873016, "grad_norm": 0.224609375, "learning_rate": 0.1, "loss": 2.070025682449341, "step": 8744 }, { "epoch": 0.27765079365079365, "grad_norm": 0.1201171875, "learning_rate": 0.1, "loss": 2.0891008377075195, "step": 8746 }, { "epoch": 0.2777142857142857, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.1055970191955566, "step": 8748 }, { "epoch": 0.2777777777777778, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.1428401470184326, "step": 8750 }, { "epoch": 0.27784126984126983, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.075908660888672, "step": 8752 }, { "epoch": 0.27790476190476193, "grad_norm": 0.07177734375, "learning_rate": 0.1, "loss": 2.1127548217773438, "step": 8754 }, { "epoch": 0.277968253968254, "grad_norm": 0.049560546875, "learning_rate": 0.1, "loss": 2.0872695446014404, "step": 8756 }, { "epoch": 0.278031746031746, "grad_norm": 0.11181640625, "learning_rate": 0.1, "loss": 2.093517303466797, "step": 8758 }, { "epoch": 0.2780952380952381, "grad_norm": 0.11376953125, "learning_rate": 0.1, "loss": 2.101145029067993, "step": 8760 }, { "epoch": 0.27815873015873016, "grad_norm": 0.21484375, "learning_rate": 0.1, "loss": 2.0886099338531494, "step": 8762 }, { "epoch": 0.2782222222222222, "grad_norm": 0.451171875, "learning_rate": 0.1, "loss": 2.1394264698028564, "step": 8764 }, { "epoch": 0.2782857142857143, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.1503655910491943, "step": 8766 }, { "epoch": 0.27834920634920635, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.146308660507202, "step": 8768 }, { "epoch": 0.2784126984126984, "grad_norm": 0.2451171875, "learning_rate": 0.1, "loss": 2.1551287174224854, "step": 8770 }, { "epoch": 0.2784761904761905, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.1128764152526855, "step": 8772 }, { "epoch": 0.27853968253968253, "grad_norm": 0.07666015625, "learning_rate": 0.1, "loss": 2.1701791286468506, "step": 8774 }, { "epoch": 0.27860317460317463, "grad_norm": 0.07568359375, "learning_rate": 0.1, "loss": 2.1299755573272705, "step": 8776 }, { "epoch": 0.2786666666666667, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.1372087001800537, "step": 8778 }, { "epoch": 0.2787301587301587, "grad_norm": 0.0654296875, "learning_rate": 0.1, "loss": 2.153611660003662, "step": 8780 }, { "epoch": 0.2787936507936508, "grad_norm": 0.11376953125, "learning_rate": 0.1, "loss": 2.1312272548675537, "step": 8782 }, { "epoch": 0.27885714285714286, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.1453471183776855, "step": 8784 }, { "epoch": 0.2789206349206349, "grad_norm": 0.05908203125, "learning_rate": 0.1, "loss": 2.1805078983306885, "step": 8786 }, { "epoch": 0.278984126984127, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.1739680767059326, "step": 8788 }, { "epoch": 0.27904761904761904, "grad_norm": 0.314453125, "learning_rate": 0.1, "loss": 2.1261680126190186, "step": 8790 }, { "epoch": 0.2791111111111111, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.1204230785369873, "step": 8792 }, { "epoch": 0.2791746031746032, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.159334421157837, "step": 8794 }, { "epoch": 0.27923809523809523, "grad_norm": 0.08056640625, "learning_rate": 0.1, "loss": 2.1768875122070312, "step": 8796 }, { "epoch": 0.27930158730158733, "grad_norm": 0.0712890625, "learning_rate": 0.1, "loss": 2.1811749935150146, "step": 8798 }, { "epoch": 0.27936507936507937, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.1662099361419678, "step": 8800 }, { "epoch": 0.2794285714285714, "grad_norm": 0.22265625, "learning_rate": 0.1, "loss": 2.185159921646118, "step": 8802 }, { "epoch": 0.2794920634920635, "grad_norm": 0.10595703125, "learning_rate": 0.1, "loss": 2.167827606201172, "step": 8804 }, { "epoch": 0.27955555555555556, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.175893783569336, "step": 8806 }, { "epoch": 0.2796190476190476, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.1560049057006836, "step": 8808 }, { "epoch": 0.2796825396825397, "grad_norm": 0.1650390625, "learning_rate": 0.1, "loss": 2.166337251663208, "step": 8810 }, { "epoch": 0.27974603174603174, "grad_norm": 0.08642578125, "learning_rate": 0.1, "loss": 2.17671275138855, "step": 8812 }, { "epoch": 0.2798095238095238, "grad_norm": 0.10205078125, "learning_rate": 0.1, "loss": 2.1803746223449707, "step": 8814 }, { "epoch": 0.2798730158730159, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.1711649894714355, "step": 8816 }, { "epoch": 0.27993650793650793, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.1615333557128906, "step": 8818 }, { "epoch": 0.28, "grad_norm": 0.10498046875, "learning_rate": 0.1, "loss": 2.1898813247680664, "step": 8820 }, { "epoch": 0.28006349206349207, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.1542539596557617, "step": 8822 }, { "epoch": 0.2801269841269841, "grad_norm": 0.072265625, "learning_rate": 0.1, "loss": 2.212780475616455, "step": 8824 }, { "epoch": 0.2801904761904762, "grad_norm": 0.08642578125, "learning_rate": 0.1, "loss": 2.182939052581787, "step": 8826 }, { "epoch": 0.28025396825396826, "grad_norm": 0.1982421875, "learning_rate": 0.1, "loss": 2.15337872505188, "step": 8828 }, { "epoch": 0.2803174603174603, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.198306083679199, "step": 8830 }, { "epoch": 0.2803809523809524, "grad_norm": 0.404296875, "learning_rate": 0.1, "loss": 2.163369655609131, "step": 8832 }, { "epoch": 0.28044444444444444, "grad_norm": 0.1650390625, "learning_rate": 0.1, "loss": 2.183056592941284, "step": 8834 }, { "epoch": 0.2805079365079365, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.1743345260620117, "step": 8836 }, { "epoch": 0.2805714285714286, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.173196792602539, "step": 8838 }, { "epoch": 0.2806349206349206, "grad_norm": 0.07080078125, "learning_rate": 0.1, "loss": 2.1779937744140625, "step": 8840 }, { "epoch": 0.2806984126984127, "grad_norm": 0.1171875, "learning_rate": 0.1, "loss": 2.158475399017334, "step": 8842 }, { "epoch": 0.28076190476190477, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.170039415359497, "step": 8844 }, { "epoch": 0.2808253968253968, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.1667003631591797, "step": 8846 }, { "epoch": 0.2808888888888889, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.1522533893585205, "step": 8848 }, { "epoch": 0.28095238095238095, "grad_norm": 0.048828125, "learning_rate": 0.1, "loss": 2.1537110805511475, "step": 8850 }, { "epoch": 0.281015873015873, "grad_norm": 0.0537109375, "learning_rate": 0.1, "loss": 2.160660982131958, "step": 8852 }, { "epoch": 0.2810793650793651, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.196817636489868, "step": 8854 }, { "epoch": 0.28114285714285714, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.1816718578338623, "step": 8856 }, { "epoch": 0.2812063492063492, "grad_norm": 0.25, "learning_rate": 0.1, "loss": 2.194470167160034, "step": 8858 }, { "epoch": 0.2812698412698413, "grad_norm": 0.10791015625, "learning_rate": 0.1, "loss": 2.183758497238159, "step": 8860 }, { "epoch": 0.2813333333333333, "grad_norm": 0.2216796875, "learning_rate": 0.1, "loss": 2.2079179286956787, "step": 8862 }, { "epoch": 0.2813968253968254, "grad_norm": 0.1328125, "learning_rate": 0.1, "loss": 2.186175584793091, "step": 8864 }, { "epoch": 0.28146031746031747, "grad_norm": 0.2314453125, "learning_rate": 0.1, "loss": 2.1994242668151855, "step": 8866 }, { "epoch": 0.2815238095238095, "grad_norm": 0.2431640625, "learning_rate": 0.1, "loss": 2.1821951866149902, "step": 8868 }, { "epoch": 0.2815873015873016, "grad_norm": 0.0849609375, "learning_rate": 0.1, "loss": 2.1879494190216064, "step": 8870 }, { "epoch": 0.28165079365079365, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.186424493789673, "step": 8872 }, { "epoch": 0.2817142857142857, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.1793198585510254, "step": 8874 }, { "epoch": 0.2817777777777778, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.1502716541290283, "step": 8876 }, { "epoch": 0.28184126984126984, "grad_norm": 0.294921875, "learning_rate": 0.1, "loss": 2.161484479904175, "step": 8878 }, { "epoch": 0.2819047619047619, "grad_norm": 0.056884765625, "learning_rate": 0.1, "loss": 2.195265293121338, "step": 8880 }, { "epoch": 0.281968253968254, "grad_norm": 0.044677734375, "learning_rate": 0.1, "loss": 2.174272060394287, "step": 8882 }, { "epoch": 0.282031746031746, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.1788079738616943, "step": 8884 }, { "epoch": 0.2820952380952381, "grad_norm": 0.234375, "learning_rate": 0.1, "loss": 2.1915924549102783, "step": 8886 }, { "epoch": 0.28215873015873016, "grad_norm": 0.353515625, "learning_rate": 0.1, "loss": 2.17026686668396, "step": 8888 }, { "epoch": 0.2822222222222222, "grad_norm": 0.06591796875, "learning_rate": 0.1, "loss": 2.2053048610687256, "step": 8890 }, { "epoch": 0.2822857142857143, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.187246322631836, "step": 8892 }, { "epoch": 0.28234920634920635, "grad_norm": 0.26171875, "learning_rate": 0.1, "loss": 2.213986396789551, "step": 8894 }, { "epoch": 0.2824126984126984, "grad_norm": 0.21484375, "learning_rate": 0.1, "loss": 2.183722972869873, "step": 8896 }, { "epoch": 0.2824761904761905, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.1777892112731934, "step": 8898 }, { "epoch": 0.28253968253968254, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.1808652877807617, "step": 8900 }, { "epoch": 0.2826031746031746, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.1496033668518066, "step": 8902 }, { "epoch": 0.2826666666666667, "grad_norm": 0.1181640625, "learning_rate": 0.1, "loss": 2.199847459793091, "step": 8904 }, { "epoch": 0.2827301587301587, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.182666063308716, "step": 8906 }, { "epoch": 0.2827936507936508, "grad_norm": 0.058837890625, "learning_rate": 0.1, "loss": 2.1668598651885986, "step": 8908 }, { "epoch": 0.28285714285714286, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.17806077003479, "step": 8910 }, { "epoch": 0.2829206349206349, "grad_norm": 0.10205078125, "learning_rate": 0.1, "loss": 2.1802000999450684, "step": 8912 }, { "epoch": 0.282984126984127, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.175011396408081, "step": 8914 }, { "epoch": 0.28304761904761905, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.2038304805755615, "step": 8916 }, { "epoch": 0.2831111111111111, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.167120933532715, "step": 8918 }, { "epoch": 0.2831746031746032, "grad_norm": 0.251953125, "learning_rate": 0.1, "loss": 2.170675039291382, "step": 8920 }, { "epoch": 0.28323809523809523, "grad_norm": 0.294921875, "learning_rate": 0.1, "loss": 2.1791868209838867, "step": 8922 }, { "epoch": 0.2833015873015873, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.1834564208984375, "step": 8924 }, { "epoch": 0.2833650793650794, "grad_norm": 0.46484375, "learning_rate": 0.1, "loss": 2.16859769821167, "step": 8926 }, { "epoch": 0.2834285714285714, "grad_norm": 0.1123046875, "learning_rate": 0.1, "loss": 2.1890223026275635, "step": 8928 }, { "epoch": 0.2834920634920635, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.1806206703186035, "step": 8930 }, { "epoch": 0.28355555555555556, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.173555374145508, "step": 8932 }, { "epoch": 0.2836190476190476, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.18434739112854, "step": 8934 }, { "epoch": 0.2836825396825397, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.2081351280212402, "step": 8936 }, { "epoch": 0.28374603174603175, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.1782050132751465, "step": 8938 }, { "epoch": 0.2838095238095238, "grad_norm": 0.07470703125, "learning_rate": 0.1, "loss": 2.1914258003234863, "step": 8940 }, { "epoch": 0.2838730158730159, "grad_norm": 0.1220703125, "learning_rate": 0.1, "loss": 2.186129331588745, "step": 8942 }, { "epoch": 0.28393650793650793, "grad_norm": 0.05908203125, "learning_rate": 0.1, "loss": 2.190122365951538, "step": 8944 }, { "epoch": 0.284, "grad_norm": 0.04833984375, "learning_rate": 0.1, "loss": 2.1819400787353516, "step": 8946 }, { "epoch": 0.2840634920634921, "grad_norm": 0.08837890625, "learning_rate": 0.1, "loss": 2.1656758785247803, "step": 8948 }, { "epoch": 0.2841269841269841, "grad_norm": 0.064453125, "learning_rate": 0.1, "loss": 2.181184768676758, "step": 8950 }, { "epoch": 0.2841904761904762, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.1798245906829834, "step": 8952 }, { "epoch": 0.28425396825396826, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.2067718505859375, "step": 8954 }, { "epoch": 0.2843174603174603, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.208606481552124, "step": 8956 }, { "epoch": 0.2843809523809524, "grad_norm": 0.37109375, "learning_rate": 0.1, "loss": 2.185300350189209, "step": 8958 }, { "epoch": 0.28444444444444444, "grad_norm": 0.08056640625, "learning_rate": 0.1, "loss": 2.204303026199341, "step": 8960 }, { "epoch": 0.2845079365079365, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.2258753776550293, "step": 8962 }, { "epoch": 0.2845714285714286, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.1987195014953613, "step": 8964 }, { "epoch": 0.28463492063492063, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.207448959350586, "step": 8966 }, { "epoch": 0.2846984126984127, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.163579225540161, "step": 8968 }, { "epoch": 0.28476190476190477, "grad_norm": 0.0693359375, "learning_rate": 0.1, "loss": 2.1816811561584473, "step": 8970 }, { "epoch": 0.2848253968253968, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.195586681365967, "step": 8972 }, { "epoch": 0.2848888888888889, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.1725900173187256, "step": 8974 }, { "epoch": 0.28495238095238096, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.184699058532715, "step": 8976 }, { "epoch": 0.285015873015873, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.1791391372680664, "step": 8978 }, { "epoch": 0.2850793650793651, "grad_norm": 0.296875, "learning_rate": 0.1, "loss": 2.201615571975708, "step": 8980 }, { "epoch": 0.28514285714285714, "grad_norm": 0.20703125, "learning_rate": 0.1, "loss": 2.181730031967163, "step": 8982 }, { "epoch": 0.2852063492063492, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.2151029109954834, "step": 8984 }, { "epoch": 0.2852698412698413, "grad_norm": 0.2373046875, "learning_rate": 0.1, "loss": 2.194033622741699, "step": 8986 }, { "epoch": 0.2853333333333333, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.1902871131896973, "step": 8988 }, { "epoch": 0.28539682539682537, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.15938401222229, "step": 8990 }, { "epoch": 0.28546031746031747, "grad_norm": 0.115234375, "learning_rate": 0.1, "loss": 2.1988768577575684, "step": 8992 }, { "epoch": 0.2855238095238095, "grad_norm": 0.0732421875, "learning_rate": 0.1, "loss": 2.197119951248169, "step": 8994 }, { "epoch": 0.2855873015873016, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.2141082286834717, "step": 8996 }, { "epoch": 0.28565079365079366, "grad_norm": 0.11376953125, "learning_rate": 0.1, "loss": 2.227226734161377, "step": 8998 }, { "epoch": 0.2857142857142857, "grad_norm": 0.236328125, "learning_rate": 0.1, "loss": 2.2329914569854736, "step": 9000 }, { "epoch": 0.2857777777777778, "grad_norm": 0.2197265625, "learning_rate": 0.1, "loss": 2.1901376247406006, "step": 9002 }, { "epoch": 0.28584126984126984, "grad_norm": 0.35546875, "learning_rate": 0.1, "loss": 2.199721336364746, "step": 9004 }, { "epoch": 0.2859047619047619, "grad_norm": 0.06640625, "learning_rate": 0.1, "loss": 2.209036350250244, "step": 9006 }, { "epoch": 0.285968253968254, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.230076551437378, "step": 9008 }, { "epoch": 0.286031746031746, "grad_norm": 0.056884765625, "learning_rate": 0.1, "loss": 2.1973347663879395, "step": 9010 }, { "epoch": 0.28609523809523807, "grad_norm": 0.2197265625, "learning_rate": 0.1, "loss": 2.1874778270721436, "step": 9012 }, { "epoch": 0.28615873015873017, "grad_norm": 0.353515625, "learning_rate": 0.1, "loss": 2.202613115310669, "step": 9014 }, { "epoch": 0.2862222222222222, "grad_norm": 0.103515625, "learning_rate": 0.1, "loss": 2.212963342666626, "step": 9016 }, { "epoch": 0.2862857142857143, "grad_norm": 0.064453125, "learning_rate": 0.1, "loss": 2.226928472518921, "step": 9018 }, { "epoch": 0.28634920634920635, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.1980597972869873, "step": 9020 }, { "epoch": 0.2864126984126984, "grad_norm": 0.1416015625, "learning_rate": 0.1, "loss": 2.227907180786133, "step": 9022 }, { "epoch": 0.2864761904761905, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.2173283100128174, "step": 9024 }, { "epoch": 0.28653968253968254, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.1922576427459717, "step": 9026 }, { "epoch": 0.2866031746031746, "grad_norm": 0.1875, "learning_rate": 0.1, "loss": 2.210458755493164, "step": 9028 }, { "epoch": 0.2866666666666667, "grad_norm": 0.09814453125, "learning_rate": 0.1, "loss": 2.225430965423584, "step": 9030 }, { "epoch": 0.2867301587301587, "grad_norm": 0.07421875, "learning_rate": 0.1, "loss": 2.2024455070495605, "step": 9032 }, { "epoch": 0.28679365079365077, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.2061767578125, "step": 9034 }, { "epoch": 0.28685714285714287, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.213874101638794, "step": 9036 }, { "epoch": 0.2869206349206349, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.1802902221679688, "step": 9038 }, { "epoch": 0.286984126984127, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.2048802375793457, "step": 9040 }, { "epoch": 0.28704761904761905, "grad_norm": 0.29296875, "learning_rate": 0.1, "loss": 2.192049026489258, "step": 9042 }, { "epoch": 0.2871111111111111, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.180990695953369, "step": 9044 }, { "epoch": 0.2871746031746032, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.188490152359009, "step": 9046 }, { "epoch": 0.28723809523809524, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.1854870319366455, "step": 9048 }, { "epoch": 0.2873015873015873, "grad_norm": 0.330078125, "learning_rate": 0.1, "loss": 2.212791919708252, "step": 9050 }, { "epoch": 0.2873650793650794, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.2178494930267334, "step": 9052 }, { "epoch": 0.2874285714285714, "grad_norm": 0.060791015625, "learning_rate": 0.1, "loss": 2.23032808303833, "step": 9054 }, { "epoch": 0.28749206349206347, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.2120113372802734, "step": 9056 }, { "epoch": 0.28755555555555556, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.183887243270874, "step": 9058 }, { "epoch": 0.2876190476190476, "grad_norm": 0.1142578125, "learning_rate": 0.1, "loss": 2.229914665222168, "step": 9060 }, { "epoch": 0.2876825396825397, "grad_norm": 0.23046875, "learning_rate": 0.1, "loss": 2.2208428382873535, "step": 9062 }, { "epoch": 0.28774603174603175, "grad_norm": 0.39453125, "learning_rate": 0.1, "loss": 2.225252866744995, "step": 9064 }, { "epoch": 0.2878095238095238, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.221372365951538, "step": 9066 }, { "epoch": 0.2878730158730159, "grad_norm": 0.0732421875, "learning_rate": 0.1, "loss": 2.2307841777801514, "step": 9068 }, { "epoch": 0.28793650793650793, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.232811689376831, "step": 9070 }, { "epoch": 0.288, "grad_norm": 0.111328125, "learning_rate": 0.1, "loss": 2.218158006668091, "step": 9072 }, { "epoch": 0.2880634920634921, "grad_norm": 0.0751953125, "learning_rate": 0.1, "loss": 2.1936721801757812, "step": 9074 }, { "epoch": 0.2881269841269841, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.2040860652923584, "step": 9076 }, { "epoch": 0.28819047619047616, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.239894390106201, "step": 9078 }, { "epoch": 0.28825396825396826, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.2085509300231934, "step": 9080 }, { "epoch": 0.2883174603174603, "grad_norm": 0.4453125, "learning_rate": 0.1, "loss": 2.2247421741485596, "step": 9082 }, { "epoch": 0.2883809523809524, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.241921901702881, "step": 9084 }, { "epoch": 0.28844444444444445, "grad_norm": 0.07080078125, "learning_rate": 0.1, "loss": 2.2192020416259766, "step": 9086 }, { "epoch": 0.2885079365079365, "grad_norm": 0.07861328125, "learning_rate": 0.1, "loss": 2.2454845905303955, "step": 9088 }, { "epoch": 0.2885714285714286, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.232693910598755, "step": 9090 }, { "epoch": 0.28863492063492063, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.2310402393341064, "step": 9092 }, { "epoch": 0.2886984126984127, "grad_norm": 0.10986328125, "learning_rate": 0.1, "loss": 2.228677272796631, "step": 9094 }, { "epoch": 0.2887619047619048, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.2222514152526855, "step": 9096 }, { "epoch": 0.2888253968253968, "grad_norm": 0.0732421875, "learning_rate": 0.1, "loss": 2.2693519592285156, "step": 9098 }, { "epoch": 0.28888888888888886, "grad_norm": 0.244140625, "learning_rate": 0.1, "loss": 2.2257843017578125, "step": 9100 }, { "epoch": 0.28895238095238096, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.25165057182312, "step": 9102 }, { "epoch": 0.289015873015873, "grad_norm": 0.06982421875, "learning_rate": 0.1, "loss": 2.2450337409973145, "step": 9104 }, { "epoch": 0.2890793650793651, "grad_norm": 0.052001953125, "learning_rate": 0.1, "loss": 2.219763994216919, "step": 9106 }, { "epoch": 0.28914285714285715, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.2455620765686035, "step": 9108 }, { "epoch": 0.2892063492063492, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.233729124069214, "step": 9110 }, { "epoch": 0.2892698412698413, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.2443225383758545, "step": 9112 }, { "epoch": 0.28933333333333333, "grad_norm": 0.23046875, "learning_rate": 0.1, "loss": 2.2144813537597656, "step": 9114 }, { "epoch": 0.2893968253968254, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.2447879314422607, "step": 9116 }, { "epoch": 0.2894603174603175, "grad_norm": 0.2197265625, "learning_rate": 0.1, "loss": 2.2340450286865234, "step": 9118 }, { "epoch": 0.2895238095238095, "grad_norm": 0.51171875, "learning_rate": 0.1, "loss": 2.269477605819702, "step": 9120 }, { "epoch": 0.28958730158730156, "grad_norm": 0.0712890625, "learning_rate": 0.1, "loss": 2.2169039249420166, "step": 9122 }, { "epoch": 0.28965079365079366, "grad_norm": 0.09423828125, "learning_rate": 0.1, "loss": 2.2260348796844482, "step": 9124 }, { "epoch": 0.2897142857142857, "grad_norm": 0.10791015625, "learning_rate": 0.1, "loss": 2.245819568634033, "step": 9126 }, { "epoch": 0.2897777777777778, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.235907793045044, "step": 9128 }, { "epoch": 0.28984126984126984, "grad_norm": 0.056396484375, "learning_rate": 0.1, "loss": 2.2330384254455566, "step": 9130 }, { "epoch": 0.2899047619047619, "grad_norm": 0.1953125, "learning_rate": 0.1, "loss": 2.227299451828003, "step": 9132 }, { "epoch": 0.289968253968254, "grad_norm": 0.23046875, "learning_rate": 0.1, "loss": 2.2406697273254395, "step": 9134 }, { "epoch": 0.29003174603174603, "grad_norm": 0.0615234375, "learning_rate": 0.1, "loss": 2.2712175846099854, "step": 9136 }, { "epoch": 0.2900952380952381, "grad_norm": 0.2373046875, "learning_rate": 0.1, "loss": 2.23835825920105, "step": 9138 }, { "epoch": 0.29015873015873017, "grad_norm": 0.2373046875, "learning_rate": 0.1, "loss": 2.2234349250793457, "step": 9140 }, { "epoch": 0.2902222222222222, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.2278506755828857, "step": 9142 }, { "epoch": 0.29028571428571426, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.231804609298706, "step": 9144 }, { "epoch": 0.29034920634920636, "grad_norm": 0.080078125, "learning_rate": 0.1, "loss": 2.2411413192749023, "step": 9146 }, { "epoch": 0.2904126984126984, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.2149741649627686, "step": 9148 }, { "epoch": 0.2904761904761905, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.251361846923828, "step": 9150 }, { "epoch": 0.29053968253968254, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.2122573852539062, "step": 9152 }, { "epoch": 0.2906031746031746, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.240163564682007, "step": 9154 }, { "epoch": 0.2906666666666667, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.224710464477539, "step": 9156 }, { "epoch": 0.2907301587301587, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.2182130813598633, "step": 9158 }, { "epoch": 0.29079365079365077, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.2215051651000977, "step": 9160 }, { "epoch": 0.29085714285714287, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.25689435005188, "step": 9162 }, { "epoch": 0.2909206349206349, "grad_norm": 0.234375, "learning_rate": 0.1, "loss": 2.2734827995300293, "step": 9164 }, { "epoch": 0.290984126984127, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.219792604446411, "step": 9166 }, { "epoch": 0.29104761904761905, "grad_norm": 0.32421875, "learning_rate": 0.1, "loss": 2.2277700901031494, "step": 9168 }, { "epoch": 0.2911111111111111, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.220167398452759, "step": 9170 }, { "epoch": 0.2911746031746032, "grad_norm": 0.10302734375, "learning_rate": 0.1, "loss": 2.2085940837860107, "step": 9172 }, { "epoch": 0.29123809523809524, "grad_norm": 0.232421875, "learning_rate": 0.1, "loss": 2.2482547760009766, "step": 9174 }, { "epoch": 0.2913015873015873, "grad_norm": 0.23046875, "learning_rate": 0.1, "loss": 2.229984998703003, "step": 9176 }, { "epoch": 0.2913650793650794, "grad_norm": 0.052001953125, "learning_rate": 0.1, "loss": 2.2151546478271484, "step": 9178 }, { "epoch": 0.2914285714285714, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.2201168537139893, "step": 9180 }, { "epoch": 0.29149206349206347, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.2162840366363525, "step": 9182 }, { "epoch": 0.29155555555555557, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.256864070892334, "step": 9184 }, { "epoch": 0.2916190476190476, "grad_norm": 0.06689453125, "learning_rate": 0.1, "loss": 2.2527568340301514, "step": 9186 }, { "epoch": 0.2916825396825397, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.2284317016601562, "step": 9188 }, { "epoch": 0.29174603174603175, "grad_norm": 0.060546875, "learning_rate": 0.1, "loss": 2.239004611968994, "step": 9190 }, { "epoch": 0.2918095238095238, "grad_norm": 0.059814453125, "learning_rate": 0.1, "loss": 2.2094016075134277, "step": 9192 }, { "epoch": 0.2918730158730159, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.2316343784332275, "step": 9194 }, { "epoch": 0.29193650793650794, "grad_norm": 0.4453125, "learning_rate": 0.1, "loss": 2.2524912357330322, "step": 9196 }, { "epoch": 0.292, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.252697467803955, "step": 9198 }, { "epoch": 0.2920634920634921, "grad_norm": 0.29296875, "learning_rate": 0.1, "loss": 2.2318739891052246, "step": 9200 }, { "epoch": 0.2921269841269841, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.2573587894439697, "step": 9202 }, { "epoch": 0.29219047619047617, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.2364437580108643, "step": 9204 }, { "epoch": 0.29225396825396827, "grad_norm": 0.043212890625, "learning_rate": 0.1, "loss": 2.2276108264923096, "step": 9206 }, { "epoch": 0.2923174603174603, "grad_norm": 0.076171875, "learning_rate": 0.1, "loss": 2.2238574028015137, "step": 9208 }, { "epoch": 0.2923809523809524, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.2267441749572754, "step": 9210 }, { "epoch": 0.29244444444444445, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.2214207649230957, "step": 9212 }, { "epoch": 0.2925079365079365, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.2528092861175537, "step": 9214 }, { "epoch": 0.2925714285714286, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.2266154289245605, "step": 9216 }, { "epoch": 0.29263492063492064, "grad_norm": 0.41015625, "learning_rate": 0.1, "loss": 2.2517874240875244, "step": 9218 }, { "epoch": 0.2926984126984127, "grad_norm": 0.2470703125, "learning_rate": 0.1, "loss": 2.213244915008545, "step": 9220 }, { "epoch": 0.2927619047619048, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.2647016048431396, "step": 9222 }, { "epoch": 0.2928253968253968, "grad_norm": 0.2392578125, "learning_rate": 0.1, "loss": 2.265657901763916, "step": 9224 }, { "epoch": 0.29288888888888887, "grad_norm": 0.06689453125, "learning_rate": 0.1, "loss": 2.2572193145751953, "step": 9226 }, { "epoch": 0.29295238095238096, "grad_norm": 0.0537109375, "learning_rate": 0.1, "loss": 2.2142860889434814, "step": 9228 }, { "epoch": 0.293015873015873, "grad_norm": 0.06689453125, "learning_rate": 0.1, "loss": 2.2253944873809814, "step": 9230 }, { "epoch": 0.2930793650793651, "grad_norm": 0.26171875, "learning_rate": 0.1, "loss": 2.218151807785034, "step": 9232 }, { "epoch": 0.29314285714285715, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.264284133911133, "step": 9234 }, { "epoch": 0.2932063492063492, "grad_norm": 0.0849609375, "learning_rate": 0.1, "loss": 2.225724220275879, "step": 9236 }, { "epoch": 0.2932698412698413, "grad_norm": 0.08544921875, "learning_rate": 0.1, "loss": 2.2374050617218018, "step": 9238 }, { "epoch": 0.29333333333333333, "grad_norm": 0.0634765625, "learning_rate": 0.1, "loss": 2.2353591918945312, "step": 9240 }, { "epoch": 0.2933968253968254, "grad_norm": 0.06396484375, "learning_rate": 0.1, "loss": 2.2092361450195312, "step": 9242 }, { "epoch": 0.2934603174603175, "grad_norm": 0.07373046875, "learning_rate": 0.1, "loss": 2.220526695251465, "step": 9244 }, { "epoch": 0.2935238095238095, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.25146222114563, "step": 9246 }, { "epoch": 0.29358730158730156, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.262054443359375, "step": 9248 }, { "epoch": 0.29365079365079366, "grad_norm": 0.40234375, "learning_rate": 0.1, "loss": 2.2033004760742188, "step": 9250 }, { "epoch": 0.2937142857142857, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.262868642807007, "step": 9252 }, { "epoch": 0.2937777777777778, "grad_norm": 0.06298828125, "learning_rate": 0.1, "loss": 2.1713268756866455, "step": 9254 }, { "epoch": 0.29384126984126985, "grad_norm": 0.05419921875, "learning_rate": 0.1, "loss": 2.2258338928222656, "step": 9256 }, { "epoch": 0.2939047619047619, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.2122890949249268, "step": 9258 }, { "epoch": 0.293968253968254, "grad_norm": 0.23046875, "learning_rate": 0.1, "loss": 2.2092833518981934, "step": 9260 }, { "epoch": 0.29403174603174603, "grad_norm": 0.0927734375, "learning_rate": 0.1, "loss": 2.214366912841797, "step": 9262 }, { "epoch": 0.2940952380952381, "grad_norm": 0.08056640625, "learning_rate": 0.1, "loss": 2.2345077991485596, "step": 9264 }, { "epoch": 0.2941587301587302, "grad_norm": 0.06103515625, "learning_rate": 0.1, "loss": 2.2209224700927734, "step": 9266 }, { "epoch": 0.2942222222222222, "grad_norm": 0.203125, "learning_rate": 0.1, "loss": 2.2098217010498047, "step": 9268 }, { "epoch": 0.29428571428571426, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.241511821746826, "step": 9270 }, { "epoch": 0.29434920634920636, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.236694574356079, "step": 9272 }, { "epoch": 0.2944126984126984, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.229106903076172, "step": 9274 }, { "epoch": 0.2944761904761905, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.1965904235839844, "step": 9276 }, { "epoch": 0.29453968253968255, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.2330551147460938, "step": 9278 }, { "epoch": 0.2946031746031746, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.2083864212036133, "step": 9280 }, { "epoch": 0.2946666666666667, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.2197952270507812, "step": 9282 }, { "epoch": 0.29473015873015873, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.218032121658325, "step": 9284 }, { "epoch": 0.2947936507936508, "grad_norm": 0.2294921875, "learning_rate": 0.1, "loss": 2.200786590576172, "step": 9286 }, { "epoch": 0.2948571428571429, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 2.209676504135132, "step": 9288 }, { "epoch": 0.2949206349206349, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.21690034866333, "step": 9290 }, { "epoch": 0.29498412698412696, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.2035932540893555, "step": 9292 }, { "epoch": 0.29504761904761906, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.2060556411743164, "step": 9294 }, { "epoch": 0.2951111111111111, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.1963279247283936, "step": 9296 }, { "epoch": 0.2951746031746032, "grad_norm": 0.21484375, "learning_rate": 0.1, "loss": 2.2590255737304688, "step": 9298 }, { "epoch": 0.29523809523809524, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.2114124298095703, "step": 9300 }, { "epoch": 0.2953015873015873, "grad_norm": 0.11962890625, "learning_rate": 0.1, "loss": 2.2255983352661133, "step": 9302 }, { "epoch": 0.2953650793650794, "grad_norm": 0.2392578125, "learning_rate": 0.1, "loss": 2.215240001678467, "step": 9304 }, { "epoch": 0.29542857142857143, "grad_norm": 0.208984375, "learning_rate": 0.1, "loss": 2.2258694171905518, "step": 9306 }, { "epoch": 0.2954920634920635, "grad_norm": 0.2216796875, "learning_rate": 0.1, "loss": 2.2359232902526855, "step": 9308 }, { "epoch": 0.29555555555555557, "grad_norm": 0.353515625, "learning_rate": 0.1, "loss": 2.203122854232788, "step": 9310 }, { "epoch": 0.2956190476190476, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.2058568000793457, "step": 9312 }, { "epoch": 0.29568253968253966, "grad_norm": 0.103515625, "learning_rate": 0.1, "loss": 2.203873872756958, "step": 9314 }, { "epoch": 0.29574603174603176, "grad_norm": 0.09814453125, "learning_rate": 0.1, "loss": 2.200739860534668, "step": 9316 }, { "epoch": 0.2958095238095238, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.2148048877716064, "step": 9318 }, { "epoch": 0.2958730158730159, "grad_norm": 0.06298828125, "learning_rate": 0.1, "loss": 2.1995017528533936, "step": 9320 }, { "epoch": 0.29593650793650794, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.1934211254119873, "step": 9322 }, { "epoch": 0.296, "grad_norm": 0.0927734375, "learning_rate": 0.1, "loss": 2.1869144439697266, "step": 9324 }, { "epoch": 0.2960634920634921, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.1968014240264893, "step": 9326 }, { "epoch": 0.2961269841269841, "grad_norm": 0.1943359375, "learning_rate": 0.1, "loss": 2.1869211196899414, "step": 9328 }, { "epoch": 0.29619047619047617, "grad_norm": 0.6953125, "learning_rate": 0.1, "loss": 2.2098889350891113, "step": 9330 }, { "epoch": 0.29625396825396827, "grad_norm": 0.08642578125, "learning_rate": 0.1, "loss": 2.2024307250976562, "step": 9332 }, { "epoch": 0.2963174603174603, "grad_norm": 0.078125, "learning_rate": 0.1, "loss": 2.1853997707366943, "step": 9334 }, { "epoch": 0.29638095238095236, "grad_norm": 0.1103515625, "learning_rate": 0.1, "loss": 2.1813759803771973, "step": 9336 }, { "epoch": 0.29644444444444445, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.1846978664398193, "step": 9338 }, { "epoch": 0.2965079365079365, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.22355318069458, "step": 9340 }, { "epoch": 0.2965714285714286, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.2147603034973145, "step": 9342 }, { "epoch": 0.29663492063492064, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.2199220657348633, "step": 9344 }, { "epoch": 0.2966984126984127, "grad_norm": 0.050537109375, "learning_rate": 0.1, "loss": 2.202453851699829, "step": 9346 }, { "epoch": 0.2967619047619048, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.234222412109375, "step": 9348 }, { "epoch": 0.2968253968253968, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.2035272121429443, "step": 9350 }, { "epoch": 0.29688888888888887, "grad_norm": 0.10302734375, "learning_rate": 0.1, "loss": 2.2118725776672363, "step": 9352 }, { "epoch": 0.29695238095238097, "grad_norm": 0.0703125, "learning_rate": 0.1, "loss": 2.2161428928375244, "step": 9354 }, { "epoch": 0.297015873015873, "grad_norm": 0.1982421875, "learning_rate": 0.1, "loss": 2.2128984928131104, "step": 9356 }, { "epoch": 0.29707936507936505, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.2070558071136475, "step": 9358 }, { "epoch": 0.29714285714285715, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.1847267150878906, "step": 9360 }, { "epoch": 0.2972063492063492, "grad_norm": 0.1240234375, "learning_rate": 0.1, "loss": 2.189819574356079, "step": 9362 }, { "epoch": 0.2972698412698413, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.2178409099578857, "step": 9364 }, { "epoch": 0.29733333333333334, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.184770107269287, "step": 9366 }, { "epoch": 0.2973968253968254, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.163674831390381, "step": 9368 }, { "epoch": 0.2974603174603175, "grad_norm": 0.11669921875, "learning_rate": 0.1, "loss": 2.2226927280426025, "step": 9370 }, { "epoch": 0.2975238095238095, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.2259249687194824, "step": 9372 }, { "epoch": 0.29758730158730157, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.205202341079712, "step": 9374 }, { "epoch": 0.29765079365079367, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.1762137413024902, "step": 9376 }, { "epoch": 0.2977142857142857, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.201294422149658, "step": 9378 }, { "epoch": 0.29777777777777775, "grad_norm": 0.392578125, "learning_rate": 0.1, "loss": 2.186765432357788, "step": 9380 }, { "epoch": 0.29784126984126985, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.193049192428589, "step": 9382 }, { "epoch": 0.2979047619047619, "grad_norm": 0.1181640625, "learning_rate": 0.1, "loss": 2.20552396774292, "step": 9384 }, { "epoch": 0.297968253968254, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.1927225589752197, "step": 9386 }, { "epoch": 0.29803174603174604, "grad_norm": 0.2373046875, "learning_rate": 0.1, "loss": 2.2209129333496094, "step": 9388 }, { "epoch": 0.2980952380952381, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.1645257472991943, "step": 9390 }, { "epoch": 0.2981587301587302, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.1931402683258057, "step": 9392 }, { "epoch": 0.2982222222222222, "grad_norm": 0.11767578125, "learning_rate": 0.1, "loss": 2.182758092880249, "step": 9394 }, { "epoch": 0.29828571428571427, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.190619945526123, "step": 9396 }, { "epoch": 0.29834920634920636, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.1963634490966797, "step": 9398 }, { "epoch": 0.2984126984126984, "grad_norm": 0.203125, "learning_rate": 0.1, "loss": 2.1873092651367188, "step": 9400 }, { "epoch": 0.29847619047619045, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.1481757164001465, "step": 9402 }, { "epoch": 0.29853968253968255, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.1795966625213623, "step": 9404 }, { "epoch": 0.2986031746031746, "grad_norm": 0.08837890625, "learning_rate": 0.1, "loss": 2.1823086738586426, "step": 9406 }, { "epoch": 0.2986666666666667, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.181117534637451, "step": 9408 }, { "epoch": 0.29873015873015873, "grad_norm": 0.10693359375, "learning_rate": 0.1, "loss": 2.210858106613159, "step": 9410 }, { "epoch": 0.2987936507936508, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.18422532081604, "step": 9412 }, { "epoch": 0.2988571428571429, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.1274092197418213, "step": 9414 }, { "epoch": 0.2989206349206349, "grad_norm": 0.11181640625, "learning_rate": 0.1, "loss": 2.1593363285064697, "step": 9416 }, { "epoch": 0.29898412698412696, "grad_norm": 0.216796875, "learning_rate": 0.1, "loss": 2.144622325897217, "step": 9418 }, { "epoch": 0.29904761904761906, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.1944730281829834, "step": 9420 }, { "epoch": 0.2991111111111111, "grad_norm": 0.26953125, "learning_rate": 0.1, "loss": 2.1634318828582764, "step": 9422 }, { "epoch": 0.29917460317460315, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.182488203048706, "step": 9424 }, { "epoch": 0.29923809523809525, "grad_norm": 0.2470703125, "learning_rate": 0.1, "loss": 2.190380573272705, "step": 9426 }, { "epoch": 0.2993015873015873, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.1981050968170166, "step": 9428 }, { "epoch": 0.2993650793650794, "grad_norm": 0.0927734375, "learning_rate": 0.1, "loss": 2.1723875999450684, "step": 9430 }, { "epoch": 0.29942857142857143, "grad_norm": 0.048828125, "learning_rate": 0.1, "loss": 2.1620962619781494, "step": 9432 }, { "epoch": 0.2994920634920635, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.1711692810058594, "step": 9434 }, { "epoch": 0.2995555555555556, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.198500633239746, "step": 9436 }, { "epoch": 0.2996190476190476, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.1752126216888428, "step": 9438 }, { "epoch": 0.29968253968253966, "grad_norm": 0.318359375, "learning_rate": 0.1, "loss": 2.191533327102661, "step": 9440 }, { "epoch": 0.29974603174603176, "grad_norm": 0.10791015625, "learning_rate": 0.1, "loss": 2.150499105453491, "step": 9442 }, { "epoch": 0.2998095238095238, "grad_norm": 0.08935546875, "learning_rate": 0.1, "loss": 2.1823630332946777, "step": 9444 }, { "epoch": 0.29987301587301585, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.1956892013549805, "step": 9446 }, { "epoch": 0.29993650793650795, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.1509580612182617, "step": 9448 }, { "epoch": 0.3, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.137407064437866, "step": 9450 }, { "epoch": 0.3, "eval_loss": 1.7686119079589844, "eval_runtime": 113.4854, "eval_samples_per_second": 9.358, "eval_steps_per_second": 2.344, "step": 9450 }, { "epoch": 0.3000634920634921, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.1472859382629395, "step": 9452 }, { "epoch": 0.30012698412698413, "grad_norm": 0.296875, "learning_rate": 0.1, "loss": 2.1760661602020264, "step": 9454 }, { "epoch": 0.3001904761904762, "grad_norm": 0.1943359375, "learning_rate": 0.1, "loss": 2.1648714542388916, "step": 9456 }, { "epoch": 0.3002539682539683, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.188153028488159, "step": 9458 }, { "epoch": 0.3003174603174603, "grad_norm": 0.1025390625, "learning_rate": 0.1, "loss": 2.1523313522338867, "step": 9460 }, { "epoch": 0.30038095238095236, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.1909072399139404, "step": 9462 }, { "epoch": 0.30044444444444446, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.1463563442230225, "step": 9464 }, { "epoch": 0.3005079365079365, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.1846697330474854, "step": 9466 }, { "epoch": 0.30057142857142854, "grad_norm": 0.1328125, "learning_rate": 0.1, "loss": 2.1677868366241455, "step": 9468 }, { "epoch": 0.30063492063492064, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.1488208770751953, "step": 9470 }, { "epoch": 0.3006984126984127, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.1191906929016113, "step": 9472 }, { "epoch": 0.3007619047619048, "grad_norm": 0.056884765625, "learning_rate": 0.1, "loss": 2.1638405323028564, "step": 9474 }, { "epoch": 0.30082539682539683, "grad_norm": 0.1328125, "learning_rate": 0.1, "loss": 2.178299903869629, "step": 9476 }, { "epoch": 0.3008888888888889, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.1747636795043945, "step": 9478 }, { "epoch": 0.30095238095238097, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.148120403289795, "step": 9480 }, { "epoch": 0.301015873015873, "grad_norm": 0.09326171875, "learning_rate": 0.1, "loss": 2.128260850906372, "step": 9482 }, { "epoch": 0.30107936507936506, "grad_norm": 0.208984375, "learning_rate": 0.1, "loss": 2.166790723800659, "step": 9484 }, { "epoch": 0.30114285714285716, "grad_norm": 0.294921875, "learning_rate": 0.1, "loss": 2.1461851596832275, "step": 9486 }, { "epoch": 0.3012063492063492, "grad_norm": 0.2060546875, "learning_rate": 0.1, "loss": 2.1558611392974854, "step": 9488 }, { "epoch": 0.30126984126984124, "grad_norm": 0.1416015625, "learning_rate": 0.1, "loss": 2.180940866470337, "step": 9490 }, { "epoch": 0.30133333333333334, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.166804790496826, "step": 9492 }, { "epoch": 0.3013968253968254, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.155484914779663, "step": 9494 }, { "epoch": 0.3014603174603175, "grad_norm": 0.04736328125, "learning_rate": 0.1, "loss": 2.181360960006714, "step": 9496 }, { "epoch": 0.3015238095238095, "grad_norm": 0.34375, "learning_rate": 0.1, "loss": 2.146151542663574, "step": 9498 }, { "epoch": 0.30158730158730157, "grad_norm": 0.208984375, "learning_rate": 0.1, "loss": 2.18393611907959, "step": 9500 }, { "epoch": 0.30165079365079367, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.155238389968872, "step": 9502 }, { "epoch": 0.3017142857142857, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.1201324462890625, "step": 9504 }, { "epoch": 0.30177777777777776, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.1370770931243896, "step": 9506 }, { "epoch": 0.30184126984126985, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.1706223487854004, "step": 9508 }, { "epoch": 0.3019047619047619, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.1612496376037598, "step": 9510 }, { "epoch": 0.30196825396825394, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.1535305976867676, "step": 9512 }, { "epoch": 0.30203174603174604, "grad_norm": 0.09326171875, "learning_rate": 0.1, "loss": 2.1455745697021484, "step": 9514 }, { "epoch": 0.3020952380952381, "grad_norm": 0.359375, "learning_rate": 0.1, "loss": 2.127373218536377, "step": 9516 }, { "epoch": 0.3021587301587302, "grad_norm": 0.08203125, "learning_rate": 0.1, "loss": 2.1585702896118164, "step": 9518 }, { "epoch": 0.3022222222222222, "grad_norm": 0.08837890625, "learning_rate": 0.1, "loss": 2.1447198390960693, "step": 9520 }, { "epoch": 0.30228571428571427, "grad_norm": 0.0888671875, "learning_rate": 0.1, "loss": 2.17907977104187, "step": 9522 }, { "epoch": 0.30234920634920637, "grad_norm": 0.0810546875, "learning_rate": 0.1, "loss": 2.159121513366699, "step": 9524 }, { "epoch": 0.3024126984126984, "grad_norm": 0.2451171875, "learning_rate": 0.1, "loss": 2.143207550048828, "step": 9526 }, { "epoch": 0.30247619047619045, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.159604787826538, "step": 9528 }, { "epoch": 0.30253968253968255, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.1704049110412598, "step": 9530 }, { "epoch": 0.3026031746031746, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.163226366043091, "step": 9532 }, { "epoch": 0.30266666666666664, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.1636879444122314, "step": 9534 }, { "epoch": 0.30273015873015874, "grad_norm": 0.04931640625, "learning_rate": 0.1, "loss": 2.1606156826019287, "step": 9536 }, { "epoch": 0.3027936507936508, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.1608684062957764, "step": 9538 }, { "epoch": 0.3028571428571429, "grad_norm": 0.1015625, "learning_rate": 0.1, "loss": 2.158083915710449, "step": 9540 }, { "epoch": 0.3029206349206349, "grad_norm": 0.053466796875, "learning_rate": 0.1, "loss": 2.1202549934387207, "step": 9542 }, { "epoch": 0.30298412698412697, "grad_norm": 0.06591796875, "learning_rate": 0.1, "loss": 2.142521858215332, "step": 9544 }, { "epoch": 0.30304761904761907, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.1324398517608643, "step": 9546 }, { "epoch": 0.3031111111111111, "grad_norm": 0.40234375, "learning_rate": 0.1, "loss": 2.1522200107574463, "step": 9548 }, { "epoch": 0.30317460317460315, "grad_norm": 0.28125, "learning_rate": 0.1, "loss": 2.1225717067718506, "step": 9550 }, { "epoch": 0.30323809523809525, "grad_norm": 0.1162109375, "learning_rate": 0.1, "loss": 2.1457066535949707, "step": 9552 }, { "epoch": 0.3033015873015873, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.105722188949585, "step": 9554 }, { "epoch": 0.30336507936507934, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.173302173614502, "step": 9556 }, { "epoch": 0.30342857142857144, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.1082375049591064, "step": 9558 }, { "epoch": 0.3034920634920635, "grad_norm": 0.10302734375, "learning_rate": 0.1, "loss": 2.1398470401763916, "step": 9560 }, { "epoch": 0.3035555555555556, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.1604502201080322, "step": 9562 }, { "epoch": 0.3036190476190476, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.1410305500030518, "step": 9564 }, { "epoch": 0.30368253968253967, "grad_norm": 0.052978515625, "learning_rate": 0.1, "loss": 2.17844295501709, "step": 9566 }, { "epoch": 0.30374603174603176, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.141510009765625, "step": 9568 }, { "epoch": 0.3038095238095238, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.1670103073120117, "step": 9570 }, { "epoch": 0.30387301587301585, "grad_norm": 0.0595703125, "learning_rate": 0.1, "loss": 2.137498140335083, "step": 9572 }, { "epoch": 0.30393650793650795, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.136523485183716, "step": 9574 }, { "epoch": 0.304, "grad_norm": 0.11962890625, "learning_rate": 0.1, "loss": 2.1560111045837402, "step": 9576 }, { "epoch": 0.3040634920634921, "grad_norm": 0.11279296875, "learning_rate": 0.1, "loss": 2.1164608001708984, "step": 9578 }, { "epoch": 0.30412698412698413, "grad_norm": 0.103515625, "learning_rate": 0.1, "loss": 2.150451421737671, "step": 9580 }, { "epoch": 0.3041904761904762, "grad_norm": 0.0849609375, "learning_rate": 0.1, "loss": 2.1523709297180176, "step": 9582 }, { "epoch": 0.3042539682539683, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.161597967147827, "step": 9584 }, { "epoch": 0.3043174603174603, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.150855302810669, "step": 9586 }, { "epoch": 0.30438095238095236, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.1620330810546875, "step": 9588 }, { "epoch": 0.30444444444444446, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.1669695377349854, "step": 9590 }, { "epoch": 0.3045079365079365, "grad_norm": 0.30859375, "learning_rate": 0.1, "loss": 2.133589267730713, "step": 9592 }, { "epoch": 0.30457142857142855, "grad_norm": 0.058349609375, "learning_rate": 0.1, "loss": 2.1311776638031006, "step": 9594 }, { "epoch": 0.30463492063492065, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.1437864303588867, "step": 9596 }, { "epoch": 0.3046984126984127, "grad_norm": 0.1982421875, "learning_rate": 0.1, "loss": 2.135451316833496, "step": 9598 }, { "epoch": 0.3047619047619048, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.163165330886841, "step": 9600 }, { "epoch": 0.30482539682539683, "grad_norm": 0.2314453125, "learning_rate": 0.1, "loss": 2.1614646911621094, "step": 9602 }, { "epoch": 0.3048888888888889, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.152757406234741, "step": 9604 }, { "epoch": 0.304952380952381, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.194638252258301, "step": 9606 }, { "epoch": 0.305015873015873, "grad_norm": 0.10009765625, "learning_rate": 0.1, "loss": 2.1496658325195312, "step": 9608 }, { "epoch": 0.30507936507936506, "grad_norm": 0.06640625, "learning_rate": 0.1, "loss": 2.1046526432037354, "step": 9610 }, { "epoch": 0.30514285714285716, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.180330753326416, "step": 9612 }, { "epoch": 0.3052063492063492, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.182927131652832, "step": 9614 }, { "epoch": 0.30526984126984125, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.182036876678467, "step": 9616 }, { "epoch": 0.30533333333333335, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.1367194652557373, "step": 9618 }, { "epoch": 0.3053968253968254, "grad_norm": 0.0673828125, "learning_rate": 0.1, "loss": 2.161522388458252, "step": 9620 }, { "epoch": 0.3054603174603175, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.1784839630126953, "step": 9622 }, { "epoch": 0.30552380952380953, "grad_norm": 0.2314453125, "learning_rate": 0.1, "loss": 2.1816060543060303, "step": 9624 }, { "epoch": 0.3055873015873016, "grad_norm": 0.2236328125, "learning_rate": 0.1, "loss": 2.159952163696289, "step": 9626 }, { "epoch": 0.3056507936507937, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.157240867614746, "step": 9628 }, { "epoch": 0.3057142857142857, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.1438405513763428, "step": 9630 }, { "epoch": 0.30577777777777776, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.1715211868286133, "step": 9632 }, { "epoch": 0.30584126984126986, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.183619499206543, "step": 9634 }, { "epoch": 0.3059047619047619, "grad_norm": 0.03857421875, "learning_rate": 0.1, "loss": 2.180736780166626, "step": 9636 }, { "epoch": 0.30596825396825394, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.1635658740997314, "step": 9638 }, { "epoch": 0.30603174603174604, "grad_norm": 0.384765625, "learning_rate": 0.1, "loss": 2.1773273944854736, "step": 9640 }, { "epoch": 0.3060952380952381, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.155907392501831, "step": 9642 }, { "epoch": 0.3061587301587302, "grad_norm": 0.06103515625, "learning_rate": 0.1, "loss": 2.1481211185455322, "step": 9644 }, { "epoch": 0.30622222222222223, "grad_norm": 0.10302734375, "learning_rate": 0.1, "loss": 2.1487865447998047, "step": 9646 }, { "epoch": 0.3062857142857143, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.1521055698394775, "step": 9648 }, { "epoch": 0.30634920634920637, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.1854195594787598, "step": 9650 }, { "epoch": 0.3064126984126984, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.1558547019958496, "step": 9652 }, { "epoch": 0.30647619047619046, "grad_norm": 0.10302734375, "learning_rate": 0.1, "loss": 2.1694235801696777, "step": 9654 }, { "epoch": 0.30653968253968256, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.175575017929077, "step": 9656 }, { "epoch": 0.3066031746031746, "grad_norm": 0.431640625, "learning_rate": 0.1, "loss": 2.1485464572906494, "step": 9658 }, { "epoch": 0.30666666666666664, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.174659013748169, "step": 9660 }, { "epoch": 0.30673015873015874, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.174217700958252, "step": 9662 }, { "epoch": 0.3067936507936508, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.134193181991577, "step": 9664 }, { "epoch": 0.3068571428571429, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.143662691116333, "step": 9666 }, { "epoch": 0.3069206349206349, "grad_norm": 0.2255859375, "learning_rate": 0.1, "loss": 2.1857709884643555, "step": 9668 }, { "epoch": 0.30698412698412697, "grad_norm": 0.09423828125, "learning_rate": 0.1, "loss": 2.167422294616699, "step": 9670 }, { "epoch": 0.30704761904761907, "grad_norm": 0.064453125, "learning_rate": 0.1, "loss": 2.1751763820648193, "step": 9672 }, { "epoch": 0.3071111111111111, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.1585495471954346, "step": 9674 }, { "epoch": 0.30717460317460316, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.178903102874756, "step": 9676 }, { "epoch": 0.30723809523809525, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.1489145755767822, "step": 9678 }, { "epoch": 0.3073015873015873, "grad_norm": 0.07421875, "learning_rate": 0.1, "loss": 2.1561834812164307, "step": 9680 }, { "epoch": 0.30736507936507934, "grad_norm": 0.1357421875, "learning_rate": 0.1, "loss": 2.167952537536621, "step": 9682 }, { "epoch": 0.30742857142857144, "grad_norm": 0.07177734375, "learning_rate": 0.1, "loss": 2.1573386192321777, "step": 9684 }, { "epoch": 0.3074920634920635, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.196101427078247, "step": 9686 }, { "epoch": 0.3075555555555556, "grad_norm": 0.353515625, "learning_rate": 0.1, "loss": 2.1620426177978516, "step": 9688 }, { "epoch": 0.3076190476190476, "grad_norm": 0.376953125, "learning_rate": 0.1, "loss": 2.1632256507873535, "step": 9690 }, { "epoch": 0.30768253968253967, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.1510872840881348, "step": 9692 }, { "epoch": 0.30774603174603177, "grad_norm": 0.06689453125, "learning_rate": 0.1, "loss": 2.159586191177368, "step": 9694 }, { "epoch": 0.3078095238095238, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.167750120162964, "step": 9696 }, { "epoch": 0.30787301587301585, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.125933885574341, "step": 9698 }, { "epoch": 0.30793650793650795, "grad_norm": 0.05810546875, "learning_rate": 0.1, "loss": 2.159409761428833, "step": 9700 }, { "epoch": 0.308, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.156834363937378, "step": 9702 }, { "epoch": 0.30806349206349204, "grad_norm": 0.11279296875, "learning_rate": 0.1, "loss": 2.1724915504455566, "step": 9704 }, { "epoch": 0.30812698412698414, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.1611783504486084, "step": 9706 }, { "epoch": 0.3081904761904762, "grad_norm": 0.310546875, "learning_rate": 0.1, "loss": 2.1475260257720947, "step": 9708 }, { "epoch": 0.3082539682539683, "grad_norm": 0.388671875, "learning_rate": 0.1, "loss": 2.1348135471343994, "step": 9710 }, { "epoch": 0.3083174603174603, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.176448106765747, "step": 9712 }, { "epoch": 0.30838095238095237, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.156642198562622, "step": 9714 }, { "epoch": 0.30844444444444447, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.191237688064575, "step": 9716 }, { "epoch": 0.3085079365079365, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.1917574405670166, "step": 9718 }, { "epoch": 0.30857142857142855, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.170391082763672, "step": 9720 }, { "epoch": 0.30863492063492065, "grad_norm": 0.12255859375, "learning_rate": 0.1, "loss": 2.1661112308502197, "step": 9722 }, { "epoch": 0.3086984126984127, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.190272331237793, "step": 9724 }, { "epoch": 0.30876190476190474, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.1798899173736572, "step": 9726 }, { "epoch": 0.30882539682539684, "grad_norm": 0.080078125, "learning_rate": 0.1, "loss": 2.142207145690918, "step": 9728 }, { "epoch": 0.3088888888888889, "grad_norm": 0.059326171875, "learning_rate": 0.1, "loss": 2.1621594429016113, "step": 9730 }, { "epoch": 0.308952380952381, "grad_norm": 0.10400390625, "learning_rate": 0.1, "loss": 2.154935121536255, "step": 9732 }, { "epoch": 0.309015873015873, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.1686010360717773, "step": 9734 }, { "epoch": 0.30907936507936506, "grad_norm": 0.058349609375, "learning_rate": 0.1, "loss": 2.1836397647857666, "step": 9736 }, { "epoch": 0.30914285714285716, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.1993484497070312, "step": 9738 }, { "epoch": 0.3092063492063492, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.1621124744415283, "step": 9740 }, { "epoch": 0.30926984126984125, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.1764113903045654, "step": 9742 }, { "epoch": 0.30933333333333335, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.1784377098083496, "step": 9744 }, { "epoch": 0.3093968253968254, "grad_norm": 0.224609375, "learning_rate": 0.1, "loss": 2.166985273361206, "step": 9746 }, { "epoch": 0.30946031746031744, "grad_norm": 0.453125, "learning_rate": 0.1, "loss": 2.159867286682129, "step": 9748 }, { "epoch": 0.30952380952380953, "grad_norm": 0.058349609375, "learning_rate": 0.1, "loss": 2.175865888595581, "step": 9750 }, { "epoch": 0.3095873015873016, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.1457912921905518, "step": 9752 }, { "epoch": 0.3096507936507937, "grad_norm": 0.0439453125, "learning_rate": 0.1, "loss": 2.171773672103882, "step": 9754 }, { "epoch": 0.3097142857142857, "grad_norm": 0.08984375, "learning_rate": 0.1, "loss": 2.1625921726226807, "step": 9756 }, { "epoch": 0.30977777777777776, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.1568188667297363, "step": 9758 }, { "epoch": 0.30984126984126986, "grad_norm": 0.07568359375, "learning_rate": 0.1, "loss": 2.1812832355499268, "step": 9760 }, { "epoch": 0.3099047619047619, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.2097373008728027, "step": 9762 }, { "epoch": 0.30996825396825395, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.1471362113952637, "step": 9764 }, { "epoch": 0.31003174603174605, "grad_norm": 0.0732421875, "learning_rate": 0.1, "loss": 2.193241596221924, "step": 9766 }, { "epoch": 0.3100952380952381, "grad_norm": 0.111328125, "learning_rate": 0.1, "loss": 2.1627867221832275, "step": 9768 }, { "epoch": 0.31015873015873013, "grad_norm": 0.2060546875, "learning_rate": 0.1, "loss": 2.186823844909668, "step": 9770 }, { "epoch": 0.31022222222222223, "grad_norm": 0.2412109375, "learning_rate": 0.1, "loss": 2.165410280227661, "step": 9772 }, { "epoch": 0.3102857142857143, "grad_norm": 0.115234375, "learning_rate": 0.1, "loss": 2.191713809967041, "step": 9774 }, { "epoch": 0.3103492063492064, "grad_norm": 0.058349609375, "learning_rate": 0.1, "loss": 2.1742026805877686, "step": 9776 }, { "epoch": 0.3104126984126984, "grad_norm": 0.0888671875, "learning_rate": 0.1, "loss": 2.170882225036621, "step": 9778 }, { "epoch": 0.31047619047619046, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.167639970779419, "step": 9780 }, { "epoch": 0.31053968253968256, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.1710586547851562, "step": 9782 }, { "epoch": 0.3106031746031746, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.2063710689544678, "step": 9784 }, { "epoch": 0.31066666666666665, "grad_norm": 0.259765625, "learning_rate": 0.1, "loss": 2.181265115737915, "step": 9786 }, { "epoch": 0.31073015873015875, "grad_norm": 0.0673828125, "learning_rate": 0.1, "loss": 2.1840031147003174, "step": 9788 }, { "epoch": 0.3107936507936508, "grad_norm": 0.1376953125, "learning_rate": 0.1, "loss": 2.1821494102478027, "step": 9790 }, { "epoch": 0.31085714285714283, "grad_norm": 0.2431640625, "learning_rate": 0.1, "loss": 2.1770834922790527, "step": 9792 }, { "epoch": 0.31092063492063493, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.163947105407715, "step": 9794 }, { "epoch": 0.310984126984127, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.1887195110321045, "step": 9796 }, { "epoch": 0.3110476190476191, "grad_norm": 0.08935546875, "learning_rate": 0.1, "loss": 2.1659390926361084, "step": 9798 }, { "epoch": 0.3111111111111111, "grad_norm": 0.10791015625, "learning_rate": 0.1, "loss": 2.1604509353637695, "step": 9800 }, { "epoch": 0.31117460317460316, "grad_norm": 0.06640625, "learning_rate": 0.1, "loss": 2.193321466445923, "step": 9802 }, { "epoch": 0.31123809523809526, "grad_norm": 0.053466796875, "learning_rate": 0.1, "loss": 2.1531283855438232, "step": 9804 }, { "epoch": 0.3113015873015873, "grad_norm": 0.0751953125, "learning_rate": 0.1, "loss": 2.1906158924102783, "step": 9806 }, { "epoch": 0.31136507936507934, "grad_norm": 0.31640625, "learning_rate": 0.1, "loss": 2.2052948474884033, "step": 9808 }, { "epoch": 0.31142857142857144, "grad_norm": 0.359375, "learning_rate": 0.1, "loss": 2.20705246925354, "step": 9810 }, { "epoch": 0.3114920634920635, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.1867713928222656, "step": 9812 }, { "epoch": 0.31155555555555553, "grad_norm": 0.053955078125, "learning_rate": 0.1, "loss": 2.185459852218628, "step": 9814 }, { "epoch": 0.31161904761904763, "grad_norm": 0.061279296875, "learning_rate": 0.1, "loss": 2.192904233932495, "step": 9816 }, { "epoch": 0.31168253968253967, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.16719651222229, "step": 9818 }, { "epoch": 0.31174603174603177, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.184863567352295, "step": 9820 }, { "epoch": 0.3118095238095238, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.1928882598876953, "step": 9822 }, { "epoch": 0.31187301587301586, "grad_norm": 0.2294921875, "learning_rate": 0.1, "loss": 2.1773009300231934, "step": 9824 }, { "epoch": 0.31193650793650796, "grad_norm": 0.10595703125, "learning_rate": 0.1, "loss": 2.187877893447876, "step": 9826 }, { "epoch": 0.312, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.187873363494873, "step": 9828 }, { "epoch": 0.31206349206349204, "grad_norm": 0.0546875, "learning_rate": 0.1, "loss": 2.2156882286071777, "step": 9830 }, { "epoch": 0.31212698412698414, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.1722946166992188, "step": 9832 }, { "epoch": 0.3121904761904762, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.180370330810547, "step": 9834 }, { "epoch": 0.31225396825396823, "grad_norm": 0.0634765625, "learning_rate": 0.1, "loss": 2.1398494243621826, "step": 9836 }, { "epoch": 0.3123174603174603, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.190025806427002, "step": 9838 }, { "epoch": 0.31238095238095237, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.176567554473877, "step": 9840 }, { "epoch": 0.31244444444444447, "grad_norm": 0.095703125, "learning_rate": 0.1, "loss": 2.1811344623565674, "step": 9842 }, { "epoch": 0.3125079365079365, "grad_norm": 0.06689453125, "learning_rate": 0.1, "loss": 2.159538507461548, "step": 9844 }, { "epoch": 0.31257142857142856, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.1890382766723633, "step": 9846 }, { "epoch": 0.31263492063492065, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.191279649734497, "step": 9848 }, { "epoch": 0.3126984126984127, "grad_norm": 0.1650390625, "learning_rate": 0.1, "loss": 2.1829254627227783, "step": 9850 }, { "epoch": 0.31276190476190474, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.1959803104400635, "step": 9852 }, { "epoch": 0.31282539682539684, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.1675479412078857, "step": 9854 }, { "epoch": 0.3128888888888889, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.172224521636963, "step": 9856 }, { "epoch": 0.3129523809523809, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.199164390563965, "step": 9858 }, { "epoch": 0.313015873015873, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.2071785926818848, "step": 9860 }, { "epoch": 0.31307936507936507, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.198953628540039, "step": 9862 }, { "epoch": 0.31314285714285717, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.1639485359191895, "step": 9864 }, { "epoch": 0.3132063492063492, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.1723406314849854, "step": 9866 }, { "epoch": 0.31326984126984125, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.1850998401641846, "step": 9868 }, { "epoch": 0.31333333333333335, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.2024178504943848, "step": 9870 }, { "epoch": 0.3133968253968254, "grad_norm": 0.330078125, "learning_rate": 0.1, "loss": 2.2177889347076416, "step": 9872 }, { "epoch": 0.31346031746031744, "grad_norm": 0.07861328125, "learning_rate": 0.1, "loss": 2.2047946453094482, "step": 9874 }, { "epoch": 0.31352380952380954, "grad_norm": 0.06201171875, "learning_rate": 0.1, "loss": 2.1921725273132324, "step": 9876 }, { "epoch": 0.3135873015873016, "grad_norm": 0.04931640625, "learning_rate": 0.1, "loss": 2.179666757583618, "step": 9878 }, { "epoch": 0.3136507936507936, "grad_norm": 0.08935546875, "learning_rate": 0.1, "loss": 2.1968069076538086, "step": 9880 }, { "epoch": 0.3137142857142857, "grad_norm": 0.234375, "learning_rate": 0.1, "loss": 2.215252161026001, "step": 9882 }, { "epoch": 0.31377777777777777, "grad_norm": 0.2421875, "learning_rate": 0.1, "loss": 2.207919120788574, "step": 9884 }, { "epoch": 0.31384126984126987, "grad_norm": 0.353515625, "learning_rate": 0.1, "loss": 2.192678928375244, "step": 9886 }, { "epoch": 0.3139047619047619, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.178010940551758, "step": 9888 }, { "epoch": 0.31396825396825395, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.195068836212158, "step": 9890 }, { "epoch": 0.31403174603174605, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.2022101879119873, "step": 9892 }, { "epoch": 0.3140952380952381, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.188624858856201, "step": 9894 }, { "epoch": 0.31415873015873014, "grad_norm": 0.0654296875, "learning_rate": 0.1, "loss": 2.174272060394287, "step": 9896 }, { "epoch": 0.31422222222222224, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.1800880432128906, "step": 9898 }, { "epoch": 0.3142857142857143, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.182217597961426, "step": 9900 }, { "epoch": 0.3143492063492063, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.175539255142212, "step": 9902 }, { "epoch": 0.3144126984126984, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.1903021335601807, "step": 9904 }, { "epoch": 0.31447619047619046, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.197434663772583, "step": 9906 }, { "epoch": 0.31453968253968256, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.198740005493164, "step": 9908 }, { "epoch": 0.3146031746031746, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.165989398956299, "step": 9910 }, { "epoch": 0.31466666666666665, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.22456693649292, "step": 9912 }, { "epoch": 0.31473015873015875, "grad_norm": 0.33203125, "learning_rate": 0.1, "loss": 2.1924569606781006, "step": 9914 }, { "epoch": 0.3147936507936508, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.1935088634490967, "step": 9916 }, { "epoch": 0.31485714285714284, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.222795248031616, "step": 9918 }, { "epoch": 0.31492063492063493, "grad_norm": 0.058349609375, "learning_rate": 0.1, "loss": 2.2171945571899414, "step": 9920 }, { "epoch": 0.314984126984127, "grad_norm": 0.1416015625, "learning_rate": 0.1, "loss": 2.2098042964935303, "step": 9922 }, { "epoch": 0.315047619047619, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.1736574172973633, "step": 9924 }, { "epoch": 0.3151111111111111, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.203256607055664, "step": 9926 }, { "epoch": 0.31517460317460316, "grad_norm": 0.427734375, "learning_rate": 0.1, "loss": 2.196739912033081, "step": 9928 }, { "epoch": 0.31523809523809526, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.202354669570923, "step": 9930 }, { "epoch": 0.3153015873015873, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.1933236122131348, "step": 9932 }, { "epoch": 0.31536507936507935, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.198293685913086, "step": 9934 }, { "epoch": 0.31542857142857145, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.212238311767578, "step": 9936 }, { "epoch": 0.3154920634920635, "grad_norm": 0.26953125, "learning_rate": 0.1, "loss": 2.1917948722839355, "step": 9938 }, { "epoch": 0.31555555555555553, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.187749147415161, "step": 9940 }, { "epoch": 0.31561904761904763, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.2057549953460693, "step": 9942 }, { "epoch": 0.3156825396825397, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.1741790771484375, "step": 9944 }, { "epoch": 0.3157460317460317, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.224497079849243, "step": 9946 }, { "epoch": 0.3158095238095238, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.203294277191162, "step": 9948 }, { "epoch": 0.31587301587301586, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.196547269821167, "step": 9950 }, { "epoch": 0.31593650793650796, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.216204881668091, "step": 9952 }, { "epoch": 0.316, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.2269957065582275, "step": 9954 }, { "epoch": 0.31606349206349205, "grad_norm": 0.2314453125, "learning_rate": 0.1, "loss": 2.209287643432617, "step": 9956 }, { "epoch": 0.31612698412698415, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.233351945877075, "step": 9958 }, { "epoch": 0.3161904761904762, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.187995195388794, "step": 9960 }, { "epoch": 0.31625396825396823, "grad_norm": 0.234375, "learning_rate": 0.1, "loss": 2.203009605407715, "step": 9962 }, { "epoch": 0.31631746031746033, "grad_norm": 0.0703125, "learning_rate": 0.1, "loss": 2.1894705295562744, "step": 9964 }, { "epoch": 0.3163809523809524, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.2359108924865723, "step": 9966 }, { "epoch": 0.3164444444444444, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.202052593231201, "step": 9968 }, { "epoch": 0.3165079365079365, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.155691623687744, "step": 9970 }, { "epoch": 0.31657142857142856, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.2047054767608643, "step": 9972 }, { "epoch": 0.31663492063492066, "grad_norm": 0.296875, "learning_rate": 0.1, "loss": 2.1690495014190674, "step": 9974 }, { "epoch": 0.3166984126984127, "grad_norm": 0.09814453125, "learning_rate": 0.1, "loss": 2.1789681911468506, "step": 9976 }, { "epoch": 0.31676190476190474, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.19167423248291, "step": 9978 }, { "epoch": 0.31682539682539684, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.178757667541504, "step": 9980 }, { "epoch": 0.3168888888888889, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.184861183166504, "step": 9982 }, { "epoch": 0.31695238095238093, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.141266345977783, "step": 9984 }, { "epoch": 0.31701587301587303, "grad_norm": 0.33203125, "learning_rate": 0.1, "loss": 2.199850559234619, "step": 9986 }, { "epoch": 0.31707936507936507, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.2000479698181152, "step": 9988 }, { "epoch": 0.3171428571428571, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.1777122020721436, "step": 9990 }, { "epoch": 0.3172063492063492, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.1901791095733643, "step": 9992 }, { "epoch": 0.31726984126984126, "grad_norm": 0.078125, "learning_rate": 0.1, "loss": 2.194085121154785, "step": 9994 }, { "epoch": 0.31733333333333336, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.1851909160614014, "step": 9996 }, { "epoch": 0.3173968253968254, "grad_norm": 0.1875, "learning_rate": 0.1, "loss": 2.190678358078003, "step": 9998 }, { "epoch": 0.31746031746031744, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.172008752822876, "step": 10000 }, { "epoch": 0.31752380952380954, "grad_norm": 0.0634765625, "learning_rate": 0.1, "loss": 2.1945793628692627, "step": 10002 }, { "epoch": 0.3175873015873016, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.173008680343628, "step": 10004 }, { "epoch": 0.31765079365079363, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.1987736225128174, "step": 10006 }, { "epoch": 0.3177142857142857, "grad_norm": 0.306640625, "learning_rate": 0.1, "loss": 2.194270610809326, "step": 10008 }, { "epoch": 0.31777777777777777, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.194709300994873, "step": 10010 }, { "epoch": 0.31784126984126987, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.1713898181915283, "step": 10012 }, { "epoch": 0.3179047619047619, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.1831634044647217, "step": 10014 }, { "epoch": 0.31796825396825396, "grad_norm": 0.11279296875, "learning_rate": 0.1, "loss": 2.1948635578155518, "step": 10016 }, { "epoch": 0.31803174603174605, "grad_norm": 0.1123046875, "learning_rate": 0.1, "loss": 2.1673130989074707, "step": 10018 }, { "epoch": 0.3180952380952381, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.1914515495300293, "step": 10020 }, { "epoch": 0.31815873015873014, "grad_norm": 0.330078125, "learning_rate": 0.1, "loss": 2.1910383701324463, "step": 10022 }, { "epoch": 0.31822222222222224, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.155423402786255, "step": 10024 }, { "epoch": 0.3182857142857143, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.1624786853790283, "step": 10026 }, { "epoch": 0.3183492063492063, "grad_norm": 0.07421875, "learning_rate": 0.1, "loss": 2.1976842880249023, "step": 10028 }, { "epoch": 0.3184126984126984, "grad_norm": 0.248046875, "learning_rate": 0.1, "loss": 2.1654818058013916, "step": 10030 }, { "epoch": 0.31847619047619047, "grad_norm": 0.2275390625, "learning_rate": 0.1, "loss": 2.1663379669189453, "step": 10032 }, { "epoch": 0.31853968253968257, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.1839845180511475, "step": 10034 }, { "epoch": 0.3186031746031746, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.1859796047210693, "step": 10036 }, { "epoch": 0.31866666666666665, "grad_norm": 0.408203125, "learning_rate": 0.1, "loss": 2.171574354171753, "step": 10038 }, { "epoch": 0.31873015873015875, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.1888623237609863, "step": 10040 }, { "epoch": 0.3187936507936508, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.2281229496002197, "step": 10042 }, { "epoch": 0.31885714285714284, "grad_norm": 0.06591796875, "learning_rate": 0.1, "loss": 2.215243339538574, "step": 10044 }, { "epoch": 0.31892063492063494, "grad_norm": 0.11767578125, "learning_rate": 0.1, "loss": 2.1814558506011963, "step": 10046 }, { "epoch": 0.318984126984127, "grad_norm": 0.10986328125, "learning_rate": 0.1, "loss": 2.185574769973755, "step": 10048 }, { "epoch": 0.319047619047619, "grad_norm": 0.048583984375, "learning_rate": 0.1, "loss": 2.209963321685791, "step": 10050 }, { "epoch": 0.3191111111111111, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.1775128841400146, "step": 10052 }, { "epoch": 0.31917460317460317, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.1951544284820557, "step": 10054 }, { "epoch": 0.31923809523809527, "grad_norm": 0.08203125, "learning_rate": 0.1, "loss": 2.2017500400543213, "step": 10056 }, { "epoch": 0.3193015873015873, "grad_norm": 0.064453125, "learning_rate": 0.1, "loss": 2.179600238800049, "step": 10058 }, { "epoch": 0.31936507936507935, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.2006568908691406, "step": 10060 }, { "epoch": 0.31942857142857145, "grad_norm": 0.458984375, "learning_rate": 0.1, "loss": 2.195237398147583, "step": 10062 }, { "epoch": 0.3194920634920635, "grad_norm": 0.05029296875, "learning_rate": 0.1, "loss": 2.193232536315918, "step": 10064 }, { "epoch": 0.31955555555555554, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.1862893104553223, "step": 10066 }, { "epoch": 0.31961904761904764, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.2037243843078613, "step": 10068 }, { "epoch": 0.3196825396825397, "grad_norm": 0.1328125, "learning_rate": 0.1, "loss": 2.184126615524292, "step": 10070 }, { "epoch": 0.3197460317460317, "grad_norm": 0.1865234375, "learning_rate": 0.1, "loss": 2.2134101390838623, "step": 10072 }, { "epoch": 0.3198095238095238, "grad_norm": 0.083984375, "learning_rate": 0.1, "loss": 2.183708667755127, "step": 10074 }, { "epoch": 0.31987301587301586, "grad_norm": 0.06494140625, "learning_rate": 0.1, "loss": 2.20131516456604, "step": 10076 }, { "epoch": 0.31993650793650796, "grad_norm": 0.05322265625, "learning_rate": 0.1, "loss": 2.1745753288269043, "step": 10078 }, { "epoch": 0.32, "grad_norm": 0.08349609375, "learning_rate": 0.1, "loss": 2.191664218902588, "step": 10080 }, { "epoch": 0.32006349206349205, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.1834492683410645, "step": 10082 }, { "epoch": 0.32012698412698415, "grad_norm": 0.494140625, "learning_rate": 0.1, "loss": 2.172041177749634, "step": 10084 }, { "epoch": 0.3201904761904762, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.1934311389923096, "step": 10086 }, { "epoch": 0.32025396825396824, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.1847429275512695, "step": 10088 }, { "epoch": 0.32031746031746033, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.155864953994751, "step": 10090 }, { "epoch": 0.3203809523809524, "grad_norm": 0.326171875, "learning_rate": 0.1, "loss": 2.1724495887756348, "step": 10092 }, { "epoch": 0.3204444444444444, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.2062244415283203, "step": 10094 }, { "epoch": 0.3205079365079365, "grad_norm": 0.1328125, "learning_rate": 0.1, "loss": 2.2282190322875977, "step": 10096 }, { "epoch": 0.32057142857142856, "grad_norm": 0.0947265625, "learning_rate": 0.1, "loss": 2.2121002674102783, "step": 10098 }, { "epoch": 0.32063492063492066, "grad_norm": 0.048583984375, "learning_rate": 0.1, "loss": 2.1866230964660645, "step": 10100 }, { "epoch": 0.3206984126984127, "grad_norm": 0.08984375, "learning_rate": 0.1, "loss": 2.18176007270813, "step": 10102 }, { "epoch": 0.32076190476190475, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.1906490325927734, "step": 10104 }, { "epoch": 0.32082539682539685, "grad_norm": 0.05126953125, "learning_rate": 0.1, "loss": 2.194465398788452, "step": 10106 }, { "epoch": 0.3208888888888889, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.2487235069274902, "step": 10108 }, { "epoch": 0.32095238095238093, "grad_norm": 0.20703125, "learning_rate": 0.1, "loss": 2.2107770442962646, "step": 10110 }, { "epoch": 0.32101587301587303, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.204759359359741, "step": 10112 }, { "epoch": 0.3210793650793651, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.205735683441162, "step": 10114 }, { "epoch": 0.3211428571428571, "grad_norm": 0.1240234375, "learning_rate": 0.1, "loss": 2.1723439693450928, "step": 10116 }, { "epoch": 0.3212063492063492, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.206892728805542, "step": 10118 }, { "epoch": 0.32126984126984126, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.1985116004943848, "step": 10120 }, { "epoch": 0.32133333333333336, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.1989524364471436, "step": 10122 }, { "epoch": 0.3213968253968254, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.2023370265960693, "step": 10124 }, { "epoch": 0.32146031746031745, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.169569253921509, "step": 10126 }, { "epoch": 0.32152380952380955, "grad_norm": 0.2470703125, "learning_rate": 0.1, "loss": 2.2034926414489746, "step": 10128 }, { "epoch": 0.3215873015873016, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.1975016593933105, "step": 10130 }, { "epoch": 0.32165079365079363, "grad_norm": 0.10400390625, "learning_rate": 0.1, "loss": 2.2275421619415283, "step": 10132 }, { "epoch": 0.32171428571428573, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.1854729652404785, "step": 10134 }, { "epoch": 0.3217777777777778, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.175104856491089, "step": 10136 }, { "epoch": 0.3218412698412698, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.2116458415985107, "step": 10138 }, { "epoch": 0.3219047619047619, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.2148518562316895, "step": 10140 }, { "epoch": 0.32196825396825396, "grad_norm": 0.2421875, "learning_rate": 0.1, "loss": 2.218768835067749, "step": 10142 }, { "epoch": 0.32203174603174606, "grad_norm": 0.10205078125, "learning_rate": 0.1, "loss": 2.222781181335449, "step": 10144 }, { "epoch": 0.3220952380952381, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.1772725582122803, "step": 10146 }, { "epoch": 0.32215873015873014, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.1994969844818115, "step": 10148 }, { "epoch": 0.32222222222222224, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.188096523284912, "step": 10150 }, { "epoch": 0.3222857142857143, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.181978464126587, "step": 10152 }, { "epoch": 0.32234920634920633, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.222205638885498, "step": 10154 }, { "epoch": 0.32241269841269843, "grad_norm": 0.07275390625, "learning_rate": 0.1, "loss": 2.193004846572876, "step": 10156 }, { "epoch": 0.32247619047619047, "grad_norm": 0.248046875, "learning_rate": 0.1, "loss": 2.19010853767395, "step": 10158 }, { "epoch": 0.3225396825396825, "grad_norm": 0.314453125, "learning_rate": 0.1, "loss": 2.227566719055176, "step": 10160 }, { "epoch": 0.3226031746031746, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.1928999423980713, "step": 10162 }, { "epoch": 0.32266666666666666, "grad_norm": 0.1025390625, "learning_rate": 0.1, "loss": 2.1987497806549072, "step": 10164 }, { "epoch": 0.32273015873015876, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.19431734085083, "step": 10166 }, { "epoch": 0.3227936507936508, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.181415557861328, "step": 10168 }, { "epoch": 0.32285714285714284, "grad_norm": 0.244140625, "learning_rate": 0.1, "loss": 2.199418306350708, "step": 10170 }, { "epoch": 0.32292063492063494, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.1994881629943848, "step": 10172 }, { "epoch": 0.322984126984127, "grad_norm": 0.1875, "learning_rate": 0.1, "loss": 2.1669154167175293, "step": 10174 }, { "epoch": 0.32304761904761903, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.185619831085205, "step": 10176 }, { "epoch": 0.3231111111111111, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.2202422618865967, "step": 10178 }, { "epoch": 0.32317460317460317, "grad_norm": 0.349609375, "learning_rate": 0.1, "loss": 2.192392587661743, "step": 10180 }, { "epoch": 0.3232380952380952, "grad_norm": 0.4765625, "learning_rate": 0.1, "loss": 2.2007176876068115, "step": 10182 }, { "epoch": 0.3233015873015873, "grad_norm": 0.111328125, "learning_rate": 0.1, "loss": 2.178288698196411, "step": 10184 }, { "epoch": 0.32336507936507936, "grad_norm": 0.06640625, "learning_rate": 0.1, "loss": 2.1995842456817627, "step": 10186 }, { "epoch": 0.32342857142857145, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.1904914379119873, "step": 10188 }, { "epoch": 0.3234920634920635, "grad_norm": 0.330078125, "learning_rate": 0.1, "loss": 2.199918031692505, "step": 10190 }, { "epoch": 0.32355555555555554, "grad_norm": 0.06689453125, "learning_rate": 0.1, "loss": 2.201245069503784, "step": 10192 }, { "epoch": 0.32361904761904764, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.174264907836914, "step": 10194 }, { "epoch": 0.3236825396825397, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.214263677597046, "step": 10196 }, { "epoch": 0.3237460317460317, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.1534924507141113, "step": 10198 }, { "epoch": 0.3238095238095238, "grad_norm": 0.0771484375, "learning_rate": 0.1, "loss": 2.189829111099243, "step": 10200 }, { "epoch": 0.32387301587301587, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.1942930221557617, "step": 10202 }, { "epoch": 0.3239365079365079, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.202707529067993, "step": 10204 }, { "epoch": 0.324, "grad_norm": 0.052978515625, "learning_rate": 0.1, "loss": 2.1914288997650146, "step": 10206 }, { "epoch": 0.32406349206349205, "grad_norm": 0.07763671875, "learning_rate": 0.1, "loss": 2.188530683517456, "step": 10208 }, { "epoch": 0.32412698412698415, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.2075953483581543, "step": 10210 }, { "epoch": 0.3241904761904762, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.162674903869629, "step": 10212 }, { "epoch": 0.32425396825396824, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.169445276260376, "step": 10214 }, { "epoch": 0.32431746031746034, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.1930174827575684, "step": 10216 }, { "epoch": 0.3243809523809524, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.1908702850341797, "step": 10218 }, { "epoch": 0.3244444444444444, "grad_norm": 0.06298828125, "learning_rate": 0.1, "loss": 2.209510087966919, "step": 10220 }, { "epoch": 0.3245079365079365, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.187349796295166, "step": 10222 }, { "epoch": 0.32457142857142857, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.1916568279266357, "step": 10224 }, { "epoch": 0.3246349206349206, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.1928958892822266, "step": 10226 }, { "epoch": 0.3246984126984127, "grad_norm": 0.07666015625, "learning_rate": 0.1, "loss": 2.1768198013305664, "step": 10228 }, { "epoch": 0.32476190476190475, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.165881872177124, "step": 10230 }, { "epoch": 0.32482539682539685, "grad_norm": 0.33203125, "learning_rate": 0.1, "loss": 2.2046031951904297, "step": 10232 }, { "epoch": 0.3248888888888889, "grad_norm": 0.11279296875, "learning_rate": 0.1, "loss": 2.1938626766204834, "step": 10234 }, { "epoch": 0.32495238095238094, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.2038733959198, "step": 10236 }, { "epoch": 0.32501587301587304, "grad_norm": 0.056396484375, "learning_rate": 0.1, "loss": 2.133981704711914, "step": 10238 }, { "epoch": 0.3250793650793651, "grad_norm": 0.1201171875, "learning_rate": 0.1, "loss": 2.190983533859253, "step": 10240 }, { "epoch": 0.3251428571428571, "grad_norm": 0.050537109375, "learning_rate": 0.1, "loss": 2.2191505432128906, "step": 10242 }, { "epoch": 0.3252063492063492, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.211355447769165, "step": 10244 }, { "epoch": 0.32526984126984126, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.2092180252075195, "step": 10246 }, { "epoch": 0.3253333333333333, "grad_norm": 0.1025390625, "learning_rate": 0.1, "loss": 2.1866648197174072, "step": 10248 }, { "epoch": 0.3253968253968254, "grad_norm": 0.11767578125, "learning_rate": 0.1, "loss": 2.207821846008301, "step": 10250 }, { "epoch": 0.32546031746031745, "grad_norm": 0.404296875, "learning_rate": 0.1, "loss": 2.2078616619110107, "step": 10252 }, { "epoch": 0.32552380952380955, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.216735601425171, "step": 10254 }, { "epoch": 0.3255873015873016, "grad_norm": 0.2451171875, "learning_rate": 0.1, "loss": 2.234604597091675, "step": 10256 }, { "epoch": 0.32565079365079364, "grad_norm": 0.1982421875, "learning_rate": 0.1, "loss": 2.197223663330078, "step": 10258 }, { "epoch": 0.32571428571428573, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.2022809982299805, "step": 10260 }, { "epoch": 0.3257777777777778, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.225069522857666, "step": 10262 }, { "epoch": 0.3258412698412698, "grad_norm": 0.09326171875, "learning_rate": 0.1, "loss": 2.2101097106933594, "step": 10264 }, { "epoch": 0.3259047619047619, "grad_norm": 0.05908203125, "learning_rate": 0.1, "loss": 2.2187862396240234, "step": 10266 }, { "epoch": 0.32596825396825396, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.1818199157714844, "step": 10268 }, { "epoch": 0.326031746031746, "grad_norm": 0.05615234375, "learning_rate": 0.1, "loss": 2.1773345470428467, "step": 10270 }, { "epoch": 0.3260952380952381, "grad_norm": 0.08056640625, "learning_rate": 0.1, "loss": 2.225092649459839, "step": 10272 }, { "epoch": 0.32615873015873015, "grad_norm": 0.0712890625, "learning_rate": 0.1, "loss": 2.2210476398468018, "step": 10274 }, { "epoch": 0.32622222222222225, "grad_norm": 0.06591796875, "learning_rate": 0.1, "loss": 2.2137835025787354, "step": 10276 }, { "epoch": 0.3262857142857143, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.1606078147888184, "step": 10278 }, { "epoch": 0.32634920634920633, "grad_norm": 0.24609375, "learning_rate": 0.1, "loss": 2.225872039794922, "step": 10280 }, { "epoch": 0.32641269841269843, "grad_norm": 0.30859375, "learning_rate": 0.1, "loss": 2.2250897884368896, "step": 10282 }, { "epoch": 0.3264761904761905, "grad_norm": 0.220703125, "learning_rate": 0.1, "loss": 2.2116339206695557, "step": 10284 }, { "epoch": 0.3265396825396825, "grad_norm": 0.2734375, "learning_rate": 0.1, "loss": 2.2435169219970703, "step": 10286 }, { "epoch": 0.3266031746031746, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.2176313400268555, "step": 10288 }, { "epoch": 0.32666666666666666, "grad_norm": 0.32421875, "learning_rate": 0.1, "loss": 2.1824638843536377, "step": 10290 }, { "epoch": 0.3267301587301587, "grad_norm": 0.049072265625, "learning_rate": 0.1, "loss": 2.1726951599121094, "step": 10292 }, { "epoch": 0.3267936507936508, "grad_norm": 0.06494140625, "learning_rate": 0.1, "loss": 2.182612895965576, "step": 10294 }, { "epoch": 0.32685714285714285, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.2205424308776855, "step": 10296 }, { "epoch": 0.32692063492063494, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.226999282836914, "step": 10298 }, { "epoch": 0.326984126984127, "grad_norm": 0.10986328125, "learning_rate": 0.1, "loss": 2.2443318367004395, "step": 10300 }, { "epoch": 0.32704761904761903, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.2227349281311035, "step": 10302 }, { "epoch": 0.32711111111111113, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.2043514251708984, "step": 10304 }, { "epoch": 0.3271746031746032, "grad_norm": 0.0927734375, "learning_rate": 0.1, "loss": 2.222609758377075, "step": 10306 }, { "epoch": 0.3272380952380952, "grad_norm": 0.11279296875, "learning_rate": 0.1, "loss": 2.261559247970581, "step": 10308 }, { "epoch": 0.3273015873015873, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.185516357421875, "step": 10310 }, { "epoch": 0.32736507936507936, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.1913797855377197, "step": 10312 }, { "epoch": 0.3274285714285714, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.2157411575317383, "step": 10314 }, { "epoch": 0.3274920634920635, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.2107207775115967, "step": 10316 }, { "epoch": 0.32755555555555554, "grad_norm": 0.380859375, "learning_rate": 0.1, "loss": 2.2078819274902344, "step": 10318 }, { "epoch": 0.32761904761904764, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.245715856552124, "step": 10320 }, { "epoch": 0.3276825396825397, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.2328712940216064, "step": 10322 }, { "epoch": 0.32774603174603173, "grad_norm": 0.053466796875, "learning_rate": 0.1, "loss": 2.2355945110321045, "step": 10324 }, { "epoch": 0.32780952380952383, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.215332508087158, "step": 10326 }, { "epoch": 0.32787301587301587, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.2500150203704834, "step": 10328 }, { "epoch": 0.3279365079365079, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.1843559741973877, "step": 10330 }, { "epoch": 0.328, "grad_norm": 0.11669921875, "learning_rate": 0.1, "loss": 2.193450927734375, "step": 10332 }, { "epoch": 0.32806349206349206, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.199761390686035, "step": 10334 }, { "epoch": 0.3281269841269841, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.1927754878997803, "step": 10336 }, { "epoch": 0.3281904761904762, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.215273141860962, "step": 10338 }, { "epoch": 0.32825396825396824, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.215738534927368, "step": 10340 }, { "epoch": 0.32831746031746034, "grad_norm": 0.06640625, "learning_rate": 0.1, "loss": 2.206218719482422, "step": 10342 }, { "epoch": 0.3283809523809524, "grad_norm": 0.24609375, "learning_rate": 0.1, "loss": 2.245255947113037, "step": 10344 }, { "epoch": 0.32844444444444443, "grad_norm": 0.42578125, "learning_rate": 0.1, "loss": 2.2155306339263916, "step": 10346 }, { "epoch": 0.3285079365079365, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.236671209335327, "step": 10348 }, { "epoch": 0.32857142857142857, "grad_norm": 0.08642578125, "learning_rate": 0.1, "loss": 2.1979265213012695, "step": 10350 }, { "epoch": 0.3286349206349206, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.222754955291748, "step": 10352 }, { "epoch": 0.3286984126984127, "grad_norm": 0.224609375, "learning_rate": 0.1, "loss": 2.2176032066345215, "step": 10354 }, { "epoch": 0.32876190476190476, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.2208306789398193, "step": 10356 }, { "epoch": 0.3288253968253968, "grad_norm": 0.0732421875, "learning_rate": 0.1, "loss": 2.272505521774292, "step": 10358 }, { "epoch": 0.3288888888888889, "grad_norm": 0.1015625, "learning_rate": 0.1, "loss": 2.224036455154419, "step": 10360 }, { "epoch": 0.32895238095238094, "grad_norm": 0.2412109375, "learning_rate": 0.1, "loss": 2.2531073093414307, "step": 10362 }, { "epoch": 0.32901587301587304, "grad_norm": 0.072265625, "learning_rate": 0.1, "loss": 2.240610361099243, "step": 10364 }, { "epoch": 0.3290793650793651, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.2691969871520996, "step": 10366 }, { "epoch": 0.3291428571428571, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.222153425216675, "step": 10368 }, { "epoch": 0.3292063492063492, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.2308342456817627, "step": 10370 }, { "epoch": 0.32926984126984127, "grad_norm": 0.0927734375, "learning_rate": 0.1, "loss": 2.2369120121002197, "step": 10372 }, { "epoch": 0.3293333333333333, "grad_norm": 0.0810546875, "learning_rate": 0.1, "loss": 2.232517719268799, "step": 10374 }, { "epoch": 0.3293968253968254, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.216803789138794, "step": 10376 }, { "epoch": 0.32946031746031745, "grad_norm": 0.30859375, "learning_rate": 0.1, "loss": 2.2027909755706787, "step": 10378 }, { "epoch": 0.3295238095238095, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.191666841506958, "step": 10380 }, { "epoch": 0.3295873015873016, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.2546019554138184, "step": 10382 }, { "epoch": 0.32965079365079364, "grad_norm": 0.26953125, "learning_rate": 0.1, "loss": 2.2449686527252197, "step": 10384 }, { "epoch": 0.32971428571428574, "grad_norm": 0.2412109375, "learning_rate": 0.1, "loss": 2.2395853996276855, "step": 10386 }, { "epoch": 0.3297777777777778, "grad_norm": 0.07080078125, "learning_rate": 0.1, "loss": 2.2436435222625732, "step": 10388 }, { "epoch": 0.3298412698412698, "grad_norm": 0.06103515625, "learning_rate": 0.1, "loss": 2.1957650184631348, "step": 10390 }, { "epoch": 0.3299047619047619, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.1989736557006836, "step": 10392 }, { "epoch": 0.32996825396825397, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.2154617309570312, "step": 10394 }, { "epoch": 0.330031746031746, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.2190306186676025, "step": 10396 }, { "epoch": 0.3300952380952381, "grad_norm": 0.10888671875, "learning_rate": 0.1, "loss": 2.2018134593963623, "step": 10398 }, { "epoch": 0.33015873015873015, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.2261345386505127, "step": 10400 }, { "epoch": 0.3302222222222222, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.2277352809906006, "step": 10402 }, { "epoch": 0.3302857142857143, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.2359938621520996, "step": 10404 }, { "epoch": 0.33034920634920634, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.2159276008605957, "step": 10406 }, { "epoch": 0.33041269841269844, "grad_norm": 0.22265625, "learning_rate": 0.1, "loss": 2.2152233123779297, "step": 10408 }, { "epoch": 0.3304761904761905, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.1989567279815674, "step": 10410 }, { "epoch": 0.3305396825396825, "grad_norm": 0.23046875, "learning_rate": 0.1, "loss": 2.2169737815856934, "step": 10412 }, { "epoch": 0.3306031746031746, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.2402162551879883, "step": 10414 }, { "epoch": 0.33066666666666666, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.22573184967041, "step": 10416 }, { "epoch": 0.3307301587301587, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.2216873168945312, "step": 10418 }, { "epoch": 0.3307936507936508, "grad_norm": 0.35546875, "learning_rate": 0.1, "loss": 2.22286319732666, "step": 10420 }, { "epoch": 0.33085714285714285, "grad_norm": 0.248046875, "learning_rate": 0.1, "loss": 2.2141366004943848, "step": 10422 }, { "epoch": 0.3309206349206349, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.207756519317627, "step": 10424 }, { "epoch": 0.330984126984127, "grad_norm": 0.07763671875, "learning_rate": 0.1, "loss": 2.225470781326294, "step": 10426 }, { "epoch": 0.33104761904761904, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.228902816772461, "step": 10428 }, { "epoch": 0.33111111111111113, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.2184183597564697, "step": 10430 }, { "epoch": 0.3311746031746032, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.2254750728607178, "step": 10432 }, { "epoch": 0.3312380952380952, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.2171778678894043, "step": 10434 }, { "epoch": 0.3313015873015873, "grad_norm": 0.068359375, "learning_rate": 0.1, "loss": 2.237307071685791, "step": 10436 }, { "epoch": 0.33136507936507936, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.2065367698669434, "step": 10438 }, { "epoch": 0.3314285714285714, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.2208123207092285, "step": 10440 }, { "epoch": 0.3314920634920635, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.219244956970215, "step": 10442 }, { "epoch": 0.33155555555555555, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.2103569507598877, "step": 10444 }, { "epoch": 0.33161904761904765, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.2479629516601562, "step": 10446 }, { "epoch": 0.3316825396825397, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.2229833602905273, "step": 10448 }, { "epoch": 0.33174603174603173, "grad_norm": 0.546875, "learning_rate": 0.1, "loss": 2.2340261936187744, "step": 10450 }, { "epoch": 0.33180952380952383, "grad_norm": 0.11962890625, "learning_rate": 0.1, "loss": 2.228762626647949, "step": 10452 }, { "epoch": 0.3318730158730159, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.2089247703552246, "step": 10454 }, { "epoch": 0.3319365079365079, "grad_norm": 0.07275390625, "learning_rate": 0.1, "loss": 2.214805841445923, "step": 10456 }, { "epoch": 0.332, "grad_norm": 0.111328125, "learning_rate": 0.1, "loss": 2.2390830516815186, "step": 10458 }, { "epoch": 0.33206349206349206, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.255481719970703, "step": 10460 }, { "epoch": 0.3321269841269841, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.19061541557312, "step": 10462 }, { "epoch": 0.3321904761904762, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.2185862064361572, "step": 10464 }, { "epoch": 0.33225396825396825, "grad_norm": 0.064453125, "learning_rate": 0.1, "loss": 2.2265207767486572, "step": 10466 }, { "epoch": 0.33231746031746034, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.2071709632873535, "step": 10468 }, { "epoch": 0.3323809523809524, "grad_norm": 0.2333984375, "learning_rate": 0.1, "loss": 2.219552993774414, "step": 10470 }, { "epoch": 0.33244444444444443, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.2066922187805176, "step": 10472 }, { "epoch": 0.33250793650793653, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.1909236907958984, "step": 10474 }, { "epoch": 0.3325714285714286, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 2.212517023086548, "step": 10476 }, { "epoch": 0.3326349206349206, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.2175776958465576, "step": 10478 }, { "epoch": 0.3326984126984127, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.1897659301757812, "step": 10480 }, { "epoch": 0.33276190476190476, "grad_norm": 0.07763671875, "learning_rate": 0.1, "loss": 2.197108745574951, "step": 10482 }, { "epoch": 0.3328253968253968, "grad_norm": 0.08544921875, "learning_rate": 0.1, "loss": 2.210545301437378, "step": 10484 }, { "epoch": 0.3328888888888889, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.1997792720794678, "step": 10486 }, { "epoch": 0.33295238095238094, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.212092638015747, "step": 10488 }, { "epoch": 0.33301587301587304, "grad_norm": 0.08349609375, "learning_rate": 0.1, "loss": 2.193092107772827, "step": 10490 }, { "epoch": 0.3330793650793651, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.2050037384033203, "step": 10492 }, { "epoch": 0.33314285714285713, "grad_norm": 0.4140625, "learning_rate": 0.1, "loss": 2.2340750694274902, "step": 10494 }, { "epoch": 0.33320634920634923, "grad_norm": 0.07763671875, "learning_rate": 0.1, "loss": 2.1892526149749756, "step": 10496 }, { "epoch": 0.33326984126984127, "grad_norm": 0.07421875, "learning_rate": 0.1, "loss": 2.1927127838134766, "step": 10498 }, { "epoch": 0.3333333333333333, "grad_norm": 0.2216796875, "learning_rate": 0.1, "loss": 2.200645685195923, "step": 10500 }, { "epoch": 0.3333968253968254, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.215970277786255, "step": 10502 }, { "epoch": 0.33346031746031746, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.222879648208618, "step": 10504 }, { "epoch": 0.3335238095238095, "grad_norm": 0.052978515625, "learning_rate": 0.1, "loss": 2.2258737087249756, "step": 10506 }, { "epoch": 0.3335873015873016, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.1933491230010986, "step": 10508 }, { "epoch": 0.33365079365079364, "grad_norm": 0.353515625, "learning_rate": 0.1, "loss": 2.189959764480591, "step": 10510 }, { "epoch": 0.33371428571428574, "grad_norm": 0.1943359375, "learning_rate": 0.1, "loss": 2.215662717819214, "step": 10512 }, { "epoch": 0.3337777777777778, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.2161316871643066, "step": 10514 }, { "epoch": 0.3338412698412698, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.213809013366699, "step": 10516 }, { "epoch": 0.3339047619047619, "grad_norm": 0.11181640625, "learning_rate": 0.1, "loss": 2.193037509918213, "step": 10518 }, { "epoch": 0.33396825396825397, "grad_norm": 0.04638671875, "learning_rate": 0.1, "loss": 2.168241262435913, "step": 10520 }, { "epoch": 0.334031746031746, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.2035601139068604, "step": 10522 }, { "epoch": 0.3340952380952381, "grad_norm": 0.33984375, "learning_rate": 0.1, "loss": 2.1778528690338135, "step": 10524 }, { "epoch": 0.33415873015873016, "grad_norm": 0.1142578125, "learning_rate": 0.1, "loss": 2.15850830078125, "step": 10526 }, { "epoch": 0.3342222222222222, "grad_norm": 0.10791015625, "learning_rate": 0.1, "loss": 2.205479621887207, "step": 10528 }, { "epoch": 0.3342857142857143, "grad_norm": 0.07275390625, "learning_rate": 0.1, "loss": 2.195462703704834, "step": 10530 }, { "epoch": 0.33434920634920634, "grad_norm": 0.09423828125, "learning_rate": 0.1, "loss": 2.1862833499908447, "step": 10532 }, { "epoch": 0.33441269841269844, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.1653358936309814, "step": 10534 }, { "epoch": 0.3344761904761905, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.187879800796509, "step": 10536 }, { "epoch": 0.3345396825396825, "grad_norm": 0.2392578125, "learning_rate": 0.1, "loss": 2.1676132678985596, "step": 10538 }, { "epoch": 0.3346031746031746, "grad_norm": 0.298828125, "learning_rate": 0.1, "loss": 2.172863006591797, "step": 10540 }, { "epoch": 0.33466666666666667, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.221343517303467, "step": 10542 }, { "epoch": 0.3347301587301587, "grad_norm": 0.05615234375, "learning_rate": 0.1, "loss": 2.168297052383423, "step": 10544 }, { "epoch": 0.3347936507936508, "grad_norm": 0.08837890625, "learning_rate": 0.1, "loss": 2.179994821548462, "step": 10546 }, { "epoch": 0.33485714285714285, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.1946322917938232, "step": 10548 }, { "epoch": 0.3349206349206349, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.1987736225128174, "step": 10550 }, { "epoch": 0.334984126984127, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.186816453933716, "step": 10552 }, { "epoch": 0.33504761904761904, "grad_norm": 0.060546875, "learning_rate": 0.1, "loss": 2.2010762691497803, "step": 10554 }, { "epoch": 0.33511111111111114, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.20300030708313, "step": 10556 }, { "epoch": 0.3351746031746032, "grad_norm": 0.234375, "learning_rate": 0.1, "loss": 2.2044005393981934, "step": 10558 }, { "epoch": 0.3352380952380952, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.177442789077759, "step": 10560 }, { "epoch": 0.3353015873015873, "grad_norm": 0.1875, "learning_rate": 0.1, "loss": 2.1997268199920654, "step": 10562 }, { "epoch": 0.33536507936507937, "grad_norm": 0.54296875, "learning_rate": 0.1, "loss": 2.177321434020996, "step": 10564 }, { "epoch": 0.3354285714285714, "grad_norm": 0.09521484375, "learning_rate": 0.1, "loss": 2.1977639198303223, "step": 10566 }, { "epoch": 0.3354920634920635, "grad_norm": 0.06689453125, "learning_rate": 0.1, "loss": 2.186274528503418, "step": 10568 }, { "epoch": 0.33555555555555555, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.178790807723999, "step": 10570 }, { "epoch": 0.3356190476190476, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.1916494369506836, "step": 10572 }, { "epoch": 0.3356825396825397, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.198308229446411, "step": 10574 }, { "epoch": 0.33574603174603174, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.1767139434814453, "step": 10576 }, { "epoch": 0.33580952380952384, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.189945936203003, "step": 10578 }, { "epoch": 0.3358730158730159, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.1869962215423584, "step": 10580 }, { "epoch": 0.3359365079365079, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.1500842571258545, "step": 10582 }, { "epoch": 0.336, "grad_norm": 0.10009765625, "learning_rate": 0.1, "loss": 2.1756904125213623, "step": 10584 }, { "epoch": 0.33606349206349206, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.1680054664611816, "step": 10586 }, { "epoch": 0.3361269841269841, "grad_norm": 0.216796875, "learning_rate": 0.1, "loss": 2.182795763015747, "step": 10588 }, { "epoch": 0.3361904761904762, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.187570571899414, "step": 10590 }, { "epoch": 0.33625396825396825, "grad_norm": 0.216796875, "learning_rate": 0.1, "loss": 2.186627149581909, "step": 10592 }, { "epoch": 0.3363174603174603, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.1917238235473633, "step": 10594 }, { "epoch": 0.3363809523809524, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.1571314334869385, "step": 10596 }, { "epoch": 0.33644444444444443, "grad_norm": 0.062255859375, "learning_rate": 0.1, "loss": 2.2026073932647705, "step": 10598 }, { "epoch": 0.33650793650793653, "grad_norm": 0.103515625, "learning_rate": 0.1, "loss": 2.2058682441711426, "step": 10600 }, { "epoch": 0.3365714285714286, "grad_norm": 0.28125, "learning_rate": 0.1, "loss": 2.202815294265747, "step": 10602 }, { "epoch": 0.3366349206349206, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.2016568183898926, "step": 10604 }, { "epoch": 0.3366984126984127, "grad_norm": 0.11279296875, "learning_rate": 0.1, "loss": 2.1916162967681885, "step": 10606 }, { "epoch": 0.33676190476190476, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.1857426166534424, "step": 10608 }, { "epoch": 0.3368253968253968, "grad_norm": 0.08935546875, "learning_rate": 0.1, "loss": 2.2138404846191406, "step": 10610 }, { "epoch": 0.3368888888888889, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.1968023777008057, "step": 10612 }, { "epoch": 0.33695238095238095, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.193377733230591, "step": 10614 }, { "epoch": 0.337015873015873, "grad_norm": 0.302734375, "learning_rate": 0.1, "loss": 2.1726179122924805, "step": 10616 }, { "epoch": 0.3370793650793651, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 2.1705336570739746, "step": 10618 }, { "epoch": 0.33714285714285713, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.2049412727355957, "step": 10620 }, { "epoch": 0.33720634920634923, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.211937665939331, "step": 10622 }, { "epoch": 0.3372698412698413, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.178178071975708, "step": 10624 }, { "epoch": 0.3373333333333333, "grad_norm": 0.06005859375, "learning_rate": 0.1, "loss": 2.1817891597747803, "step": 10626 }, { "epoch": 0.3373968253968254, "grad_norm": 0.07373046875, "learning_rate": 0.1, "loss": 2.1902942657470703, "step": 10628 }, { "epoch": 0.33746031746031746, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.168306827545166, "step": 10630 }, { "epoch": 0.3375238095238095, "grad_norm": 0.294921875, "learning_rate": 0.1, "loss": 2.1916491985321045, "step": 10632 }, { "epoch": 0.3375873015873016, "grad_norm": 0.1162109375, "learning_rate": 0.1, "loss": 2.20019268989563, "step": 10634 }, { "epoch": 0.33765079365079365, "grad_norm": 0.1015625, "learning_rate": 0.1, "loss": 2.1709628105163574, "step": 10636 }, { "epoch": 0.3377142857142857, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.164698839187622, "step": 10638 }, { "epoch": 0.3377777777777778, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.1735100746154785, "step": 10640 }, { "epoch": 0.33784126984126983, "grad_norm": 0.2177734375, "learning_rate": 0.1, "loss": 2.1582412719726562, "step": 10642 }, { "epoch": 0.33790476190476193, "grad_norm": 0.453125, "learning_rate": 0.1, "loss": 2.1964187622070312, "step": 10644 }, { "epoch": 0.337968253968254, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.1713085174560547, "step": 10646 }, { "epoch": 0.338031746031746, "grad_norm": 0.0986328125, "learning_rate": 0.1, "loss": 2.190800666809082, "step": 10648 }, { "epoch": 0.3380952380952381, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.1849403381347656, "step": 10650 }, { "epoch": 0.33815873015873016, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.1842076778411865, "step": 10652 }, { "epoch": 0.3382222222222222, "grad_norm": 0.07861328125, "learning_rate": 0.1, "loss": 2.1835153102874756, "step": 10654 }, { "epoch": 0.3382857142857143, "grad_norm": 0.1201171875, "learning_rate": 0.1, "loss": 2.184572696685791, "step": 10656 }, { "epoch": 0.33834920634920634, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.20729398727417, "step": 10658 }, { "epoch": 0.3384126984126984, "grad_norm": 0.11328125, "learning_rate": 0.1, "loss": 2.148922920227051, "step": 10660 }, { "epoch": 0.3384761904761905, "grad_norm": 0.0595703125, "learning_rate": 0.1, "loss": 2.197078227996826, "step": 10662 }, { "epoch": 0.33853968253968253, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.1765565872192383, "step": 10664 }, { "epoch": 0.33860317460317463, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.160946846008301, "step": 10666 }, { "epoch": 0.33866666666666667, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.1720829010009766, "step": 10668 }, { "epoch": 0.3387301587301587, "grad_norm": 0.28515625, "learning_rate": 0.1, "loss": 2.2134289741516113, "step": 10670 }, { "epoch": 0.3387936507936508, "grad_norm": 0.2333984375, "learning_rate": 0.1, "loss": 2.1767289638519287, "step": 10672 }, { "epoch": 0.33885714285714286, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.1474976539611816, "step": 10674 }, { "epoch": 0.3389206349206349, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.1698927879333496, "step": 10676 }, { "epoch": 0.338984126984127, "grad_norm": 0.2421875, "learning_rate": 0.1, "loss": 2.170835494995117, "step": 10678 }, { "epoch": 0.33904761904761904, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.15285587310791, "step": 10680 }, { "epoch": 0.3391111111111111, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.1628880500793457, "step": 10682 }, { "epoch": 0.3391746031746032, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.1494805812835693, "step": 10684 }, { "epoch": 0.3392380952380952, "grad_norm": 0.052490234375, "learning_rate": 0.1, "loss": 2.1779205799102783, "step": 10686 }, { "epoch": 0.3393015873015873, "grad_norm": 0.0966796875, "learning_rate": 0.1, "loss": 2.150136947631836, "step": 10688 }, { "epoch": 0.33936507936507937, "grad_norm": 0.053466796875, "learning_rate": 0.1, "loss": 2.1458067893981934, "step": 10690 }, { "epoch": 0.3394285714285714, "grad_norm": 0.322265625, "learning_rate": 0.1, "loss": 2.15260910987854, "step": 10692 }, { "epoch": 0.3394920634920635, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.1635806560516357, "step": 10694 }, { "epoch": 0.33955555555555555, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.1902191638946533, "step": 10696 }, { "epoch": 0.3396190476190476, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.171922206878662, "step": 10698 }, { "epoch": 0.3396825396825397, "grad_norm": 0.09326171875, "learning_rate": 0.1, "loss": 2.1400513648986816, "step": 10700 }, { "epoch": 0.33974603174603174, "grad_norm": 0.1181640625, "learning_rate": 0.1, "loss": 2.1618614196777344, "step": 10702 }, { "epoch": 0.3398095238095238, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.1420485973358154, "step": 10704 }, { "epoch": 0.3398730158730159, "grad_norm": 0.1025390625, "learning_rate": 0.1, "loss": 2.181180953979492, "step": 10706 }, { "epoch": 0.3399365079365079, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.175330877304077, "step": 10708 }, { "epoch": 0.34, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.1646363735198975, "step": 10710 }, { "epoch": 0.34006349206349207, "grad_norm": 0.057373046875, "learning_rate": 0.1, "loss": 2.1419057846069336, "step": 10712 }, { "epoch": 0.3401269841269841, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.1370527744293213, "step": 10714 }, { "epoch": 0.3401904761904762, "grad_norm": 0.0654296875, "learning_rate": 0.1, "loss": 2.1385576725006104, "step": 10716 }, { "epoch": 0.34025396825396825, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.1344106197357178, "step": 10718 }, { "epoch": 0.3403174603174603, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.123504877090454, "step": 10720 }, { "epoch": 0.3403809523809524, "grad_norm": 0.240234375, "learning_rate": 0.1, "loss": 2.168992280960083, "step": 10722 }, { "epoch": 0.34044444444444444, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.182685613632202, "step": 10724 }, { "epoch": 0.3405079365079365, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.1744816303253174, "step": 10726 }, { "epoch": 0.3405714285714286, "grad_norm": 0.462890625, "learning_rate": 0.1, "loss": 2.1602675914764404, "step": 10728 }, { "epoch": 0.3406349206349206, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.1859214305877686, "step": 10730 }, { "epoch": 0.3406984126984127, "grad_norm": 0.06396484375, "learning_rate": 0.1, "loss": 2.151904582977295, "step": 10732 }, { "epoch": 0.34076190476190477, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.143826961517334, "step": 10734 }, { "epoch": 0.3408253968253968, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.1708786487579346, "step": 10736 }, { "epoch": 0.3408888888888889, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.1587464809417725, "step": 10738 }, { "epoch": 0.34095238095238095, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.159186363220215, "step": 10740 }, { "epoch": 0.341015873015873, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.132807970046997, "step": 10742 }, { "epoch": 0.3410793650793651, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.1830198764801025, "step": 10744 }, { "epoch": 0.34114285714285714, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.1978118419647217, "step": 10746 }, { "epoch": 0.3412063492063492, "grad_norm": 0.11669921875, "learning_rate": 0.1, "loss": 2.1586546897888184, "step": 10748 }, { "epoch": 0.3412698412698413, "grad_norm": 0.326171875, "learning_rate": 0.1, "loss": 2.181734323501587, "step": 10750 }, { "epoch": 0.3413333333333333, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.1670312881469727, "step": 10752 }, { "epoch": 0.3413968253968254, "grad_norm": 0.11962890625, "learning_rate": 0.1, "loss": 2.17029070854187, "step": 10754 }, { "epoch": 0.34146031746031746, "grad_norm": 0.07080078125, "learning_rate": 0.1, "loss": 2.166867971420288, "step": 10756 }, { "epoch": 0.3415238095238095, "grad_norm": 0.08544921875, "learning_rate": 0.1, "loss": 2.1606602668762207, "step": 10758 }, { "epoch": 0.3415873015873016, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.183364152908325, "step": 10760 }, { "epoch": 0.34165079365079365, "grad_norm": 0.2412109375, "learning_rate": 0.1, "loss": 2.166038990020752, "step": 10762 }, { "epoch": 0.3417142857142857, "grad_norm": 0.2255859375, "learning_rate": 0.1, "loss": 2.1788270473480225, "step": 10764 }, { "epoch": 0.3417777777777778, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.1759607791900635, "step": 10766 }, { "epoch": 0.34184126984126983, "grad_norm": 0.054931640625, "learning_rate": 0.1, "loss": 2.1977617740631104, "step": 10768 }, { "epoch": 0.3419047619047619, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.138373851776123, "step": 10770 }, { "epoch": 0.341968253968254, "grad_norm": 0.259765625, "learning_rate": 0.1, "loss": 2.1617345809936523, "step": 10772 }, { "epoch": 0.342031746031746, "grad_norm": 0.34765625, "learning_rate": 0.1, "loss": 2.158432960510254, "step": 10774 }, { "epoch": 0.3420952380952381, "grad_norm": 0.050537109375, "learning_rate": 0.1, "loss": 2.165778875350952, "step": 10776 }, { "epoch": 0.34215873015873016, "grad_norm": 0.28125, "learning_rate": 0.1, "loss": 2.1905174255371094, "step": 10778 }, { "epoch": 0.3422222222222222, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.18183970451355, "step": 10780 }, { "epoch": 0.3422857142857143, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.1712934970855713, "step": 10782 }, { "epoch": 0.34234920634920635, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.16552996635437, "step": 10784 }, { "epoch": 0.3424126984126984, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.1844046115875244, "step": 10786 }, { "epoch": 0.3424761904761905, "grad_norm": 0.10498046875, "learning_rate": 0.1, "loss": 2.1611874103546143, "step": 10788 }, { "epoch": 0.34253968253968253, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.1646482944488525, "step": 10790 }, { "epoch": 0.3426031746031746, "grad_norm": 0.11669921875, "learning_rate": 0.1, "loss": 2.1656932830810547, "step": 10792 }, { "epoch": 0.3426666666666667, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.164336681365967, "step": 10794 }, { "epoch": 0.3427301587301587, "grad_norm": 0.0703125, "learning_rate": 0.1, "loss": 2.159842014312744, "step": 10796 }, { "epoch": 0.3427936507936508, "grad_norm": 0.1181640625, "learning_rate": 0.1, "loss": 2.188279151916504, "step": 10798 }, { "epoch": 0.34285714285714286, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.165369749069214, "step": 10800 }, { "epoch": 0.3429206349206349, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.1752822399139404, "step": 10802 }, { "epoch": 0.342984126984127, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.1573591232299805, "step": 10804 }, { "epoch": 0.34304761904761905, "grad_norm": 0.216796875, "learning_rate": 0.1, "loss": 2.154291868209839, "step": 10806 }, { "epoch": 0.3431111111111111, "grad_norm": 0.2236328125, "learning_rate": 0.1, "loss": 2.1773228645324707, "step": 10808 }, { "epoch": 0.3431746031746032, "grad_norm": 0.310546875, "learning_rate": 0.1, "loss": 2.132742166519165, "step": 10810 }, { "epoch": 0.34323809523809523, "grad_norm": 0.07275390625, "learning_rate": 0.1, "loss": 2.16198992729187, "step": 10812 }, { "epoch": 0.3433015873015873, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.1417953968048096, "step": 10814 }, { "epoch": 0.3433650793650794, "grad_norm": 0.234375, "learning_rate": 0.1, "loss": 2.16147780418396, "step": 10816 }, { "epoch": 0.3434285714285714, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.16763973236084, "step": 10818 }, { "epoch": 0.3434920634920635, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.165390729904175, "step": 10820 }, { "epoch": 0.34355555555555556, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.1336190700531006, "step": 10822 }, { "epoch": 0.3436190476190476, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.1284821033477783, "step": 10824 }, { "epoch": 0.3436825396825397, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 2.205808639526367, "step": 10826 }, { "epoch": 0.34374603174603174, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.168703556060791, "step": 10828 }, { "epoch": 0.3438095238095238, "grad_norm": 0.26171875, "learning_rate": 0.1, "loss": 2.206918239593506, "step": 10830 }, { "epoch": 0.3438730158730159, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.19317364692688, "step": 10832 }, { "epoch": 0.34393650793650793, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.1836607456207275, "step": 10834 }, { "epoch": 0.344, "grad_norm": 0.224609375, "learning_rate": 0.1, "loss": 2.1935431957244873, "step": 10836 }, { "epoch": 0.34406349206349207, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.2080745697021484, "step": 10838 }, { "epoch": 0.3441269841269841, "grad_norm": 0.216796875, "learning_rate": 0.1, "loss": 2.1659133434295654, "step": 10840 }, { "epoch": 0.3441904761904762, "grad_norm": 0.1650390625, "learning_rate": 0.1, "loss": 2.208919048309326, "step": 10842 }, { "epoch": 0.34425396825396826, "grad_norm": 0.10986328125, "learning_rate": 0.1, "loss": 2.192574977874756, "step": 10844 }, { "epoch": 0.3443174603174603, "grad_norm": 0.09619140625, "learning_rate": 0.1, "loss": 2.2011077404022217, "step": 10846 }, { "epoch": 0.3443809523809524, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.2008471488952637, "step": 10848 }, { "epoch": 0.34444444444444444, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.2248358726501465, "step": 10850 }, { "epoch": 0.3445079365079365, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.2257256507873535, "step": 10852 }, { "epoch": 0.3445714285714286, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.222543239593506, "step": 10854 }, { "epoch": 0.3446349206349206, "grad_norm": 0.20703125, "learning_rate": 0.1, "loss": 2.2047646045684814, "step": 10856 }, { "epoch": 0.3446984126984127, "grad_norm": 0.0478515625, "learning_rate": 0.1, "loss": 2.201016426086426, "step": 10858 }, { "epoch": 0.34476190476190477, "grad_norm": 0.30859375, "learning_rate": 0.1, "loss": 2.212066650390625, "step": 10860 }, { "epoch": 0.3448253968253968, "grad_norm": 0.65234375, "learning_rate": 0.1, "loss": 2.2267720699310303, "step": 10862 }, { "epoch": 0.3448888888888889, "grad_norm": 0.07763671875, "learning_rate": 0.1, "loss": 2.213447332382202, "step": 10864 }, { "epoch": 0.34495238095238095, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.2096965312957764, "step": 10866 }, { "epoch": 0.345015873015873, "grad_norm": 0.09375, "learning_rate": 0.1, "loss": 2.217432737350464, "step": 10868 }, { "epoch": 0.3450793650793651, "grad_norm": 0.12255859375, "learning_rate": 0.1, "loss": 2.2168514728546143, "step": 10870 }, { "epoch": 0.34514285714285714, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.2287988662719727, "step": 10872 }, { "epoch": 0.3452063492063492, "grad_norm": 0.11767578125, "learning_rate": 0.1, "loss": 2.2565572261810303, "step": 10874 }, { "epoch": 0.3452698412698413, "grad_norm": 0.3828125, "learning_rate": 0.1, "loss": 2.2269654273986816, "step": 10876 }, { "epoch": 0.3453333333333333, "grad_norm": 0.3671875, "learning_rate": 0.1, "loss": 2.219343662261963, "step": 10878 }, { "epoch": 0.3453968253968254, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.265392303466797, "step": 10880 }, { "epoch": 0.34546031746031747, "grad_norm": 0.10693359375, "learning_rate": 0.1, "loss": 2.259321689605713, "step": 10882 }, { "epoch": 0.3455238095238095, "grad_norm": 0.060302734375, "learning_rate": 0.1, "loss": 2.2631168365478516, "step": 10884 }, { "epoch": 0.3455873015873016, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.230673313140869, "step": 10886 }, { "epoch": 0.34565079365079365, "grad_norm": 0.060546875, "learning_rate": 0.1, "loss": 2.241331100463867, "step": 10888 }, { "epoch": 0.3457142857142857, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.260580062866211, "step": 10890 }, { "epoch": 0.3457777777777778, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.268975257873535, "step": 10892 }, { "epoch": 0.34584126984126984, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.2483866214752197, "step": 10894 }, { "epoch": 0.3459047619047619, "grad_norm": 0.1123046875, "learning_rate": 0.1, "loss": 2.2478039264678955, "step": 10896 }, { "epoch": 0.345968253968254, "grad_norm": 0.12353515625, "learning_rate": 0.1, "loss": 2.266867160797119, "step": 10898 }, { "epoch": 0.346031746031746, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.2199819087982178, "step": 10900 }, { "epoch": 0.3460952380952381, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.24721097946167, "step": 10902 }, { "epoch": 0.34615873015873017, "grad_norm": 0.0693359375, "learning_rate": 0.1, "loss": 2.2504703998565674, "step": 10904 }, { "epoch": 0.3462222222222222, "grad_norm": 0.208984375, "learning_rate": 0.1, "loss": 2.250040054321289, "step": 10906 }, { "epoch": 0.3462857142857143, "grad_norm": 0.373046875, "learning_rate": 0.1, "loss": 2.261671781539917, "step": 10908 }, { "epoch": 0.34634920634920635, "grad_norm": 0.058837890625, "learning_rate": 0.1, "loss": 2.241607427597046, "step": 10910 }, { "epoch": 0.3464126984126984, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.230332612991333, "step": 10912 }, { "epoch": 0.3464761904761905, "grad_norm": 0.2373046875, "learning_rate": 0.1, "loss": 2.251079559326172, "step": 10914 }, { "epoch": 0.34653968253968254, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.264204740524292, "step": 10916 }, { "epoch": 0.3466031746031746, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.2764639854431152, "step": 10918 }, { "epoch": 0.3466666666666667, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.271958112716675, "step": 10920 }, { "epoch": 0.3467301587301587, "grad_norm": 0.1162109375, "learning_rate": 0.1, "loss": 2.2935328483581543, "step": 10922 }, { "epoch": 0.3467936507936508, "grad_norm": 0.10205078125, "learning_rate": 0.1, "loss": 2.3027031421661377, "step": 10924 }, { "epoch": 0.34685714285714286, "grad_norm": 0.10107421875, "learning_rate": 0.1, "loss": 2.275430202484131, "step": 10926 }, { "epoch": 0.3469206349206349, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.2565975189208984, "step": 10928 }, { "epoch": 0.346984126984127, "grad_norm": 0.1171875, "learning_rate": 0.1, "loss": 2.2793102264404297, "step": 10930 }, { "epoch": 0.34704761904761905, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.299699306488037, "step": 10932 }, { "epoch": 0.3471111111111111, "grad_norm": 0.33984375, "learning_rate": 0.1, "loss": 2.326904296875, "step": 10934 }, { "epoch": 0.3471746031746032, "grad_norm": 0.2236328125, "learning_rate": 0.1, "loss": 2.2868425846099854, "step": 10936 }, { "epoch": 0.34723809523809523, "grad_norm": 0.3515625, "learning_rate": 0.1, "loss": 2.2794508934020996, "step": 10938 }, { "epoch": 0.3473015873015873, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.3169283866882324, "step": 10940 }, { "epoch": 0.3473650793650794, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.2692878246307373, "step": 10942 }, { "epoch": 0.3474285714285714, "grad_norm": 0.29296875, "learning_rate": 0.1, "loss": 2.31179141998291, "step": 10944 }, { "epoch": 0.3474920634920635, "grad_norm": 0.0517578125, "learning_rate": 0.1, "loss": 2.3011832237243652, "step": 10946 }, { "epoch": 0.34755555555555556, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.289245128631592, "step": 10948 }, { "epoch": 0.3476190476190476, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.3354389667510986, "step": 10950 }, { "epoch": 0.3476825396825397, "grad_norm": 0.2236328125, "learning_rate": 0.1, "loss": 2.2830705642700195, "step": 10952 }, { "epoch": 0.34774603174603175, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.2964699268341064, "step": 10954 }, { "epoch": 0.3478095238095238, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.3209269046783447, "step": 10956 }, { "epoch": 0.3478730158730159, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.3036513328552246, "step": 10958 }, { "epoch": 0.34793650793650793, "grad_norm": 0.328125, "learning_rate": 0.1, "loss": 2.296513795852661, "step": 10960 }, { "epoch": 0.348, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.307774066925049, "step": 10962 }, { "epoch": 0.3480634920634921, "grad_norm": 0.0712890625, "learning_rate": 0.1, "loss": 2.314592123031616, "step": 10964 }, { "epoch": 0.3481269841269841, "grad_norm": 0.07763671875, "learning_rate": 0.1, "loss": 2.3010873794555664, "step": 10966 }, { "epoch": 0.3481904761904762, "grad_norm": 0.07666015625, "learning_rate": 0.1, "loss": 2.314419746398926, "step": 10968 }, { "epoch": 0.34825396825396826, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.320044994354248, "step": 10970 }, { "epoch": 0.3483174603174603, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.3089184761047363, "step": 10972 }, { "epoch": 0.3483809523809524, "grad_norm": 0.07373046875, "learning_rate": 0.1, "loss": 2.3317675590515137, "step": 10974 }, { "epoch": 0.34844444444444445, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.3759024143218994, "step": 10976 }, { "epoch": 0.3485079365079365, "grad_norm": 0.51953125, "learning_rate": 0.1, "loss": 2.3379838466644287, "step": 10978 }, { "epoch": 0.3485714285714286, "grad_norm": 0.337890625, "learning_rate": 0.1, "loss": 2.3536903858184814, "step": 10980 }, { "epoch": 0.34863492063492063, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.3582966327667236, "step": 10982 }, { "epoch": 0.3486984126984127, "grad_norm": 0.12353515625, "learning_rate": 0.1, "loss": 2.3518645763397217, "step": 10984 }, { "epoch": 0.3487619047619048, "grad_norm": 0.24609375, "learning_rate": 0.1, "loss": 2.381120204925537, "step": 10986 }, { "epoch": 0.3488253968253968, "grad_norm": 0.31640625, "learning_rate": 0.1, "loss": 2.351613998413086, "step": 10988 }, { "epoch": 0.3488888888888889, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.362210988998413, "step": 10990 }, { "epoch": 0.34895238095238096, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.353654623031616, "step": 10992 }, { "epoch": 0.349015873015873, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.371495485305786, "step": 10994 }, { "epoch": 0.3490793650793651, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.387871026992798, "step": 10996 }, { "epoch": 0.34914285714285714, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.3548786640167236, "step": 10998 }, { "epoch": 0.3492063492063492, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.38694429397583, "step": 11000 }, { "epoch": 0.3492698412698413, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.3750088214874268, "step": 11002 }, { "epoch": 0.34933333333333333, "grad_norm": 0.326171875, "learning_rate": 0.1, "loss": 2.3959736824035645, "step": 11004 }, { "epoch": 0.3493968253968254, "grad_norm": 0.236328125, "learning_rate": 0.1, "loss": 2.3784327507019043, "step": 11006 }, { "epoch": 0.34946031746031747, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.387637138366699, "step": 11008 }, { "epoch": 0.3495238095238095, "grad_norm": 0.1943359375, "learning_rate": 0.1, "loss": 2.3979554176330566, "step": 11010 }, { "epoch": 0.3495873015873016, "grad_norm": 0.396484375, "learning_rate": 0.1, "loss": 2.3882758617401123, "step": 11012 }, { "epoch": 0.34965079365079366, "grad_norm": 0.11376953125, "learning_rate": 0.1, "loss": 2.3989250659942627, "step": 11014 }, { "epoch": 0.3497142857142857, "grad_norm": 0.0439453125, "learning_rate": 0.1, "loss": 2.4110400676727295, "step": 11016 }, { "epoch": 0.3497777777777778, "grad_norm": 0.053955078125, "learning_rate": 0.1, "loss": 2.3825628757476807, "step": 11018 }, { "epoch": 0.34984126984126984, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.39042329788208, "step": 11020 }, { "epoch": 0.3499047619047619, "grad_norm": 0.1376953125, "learning_rate": 0.1, "loss": 2.4223592281341553, "step": 11022 }, { "epoch": 0.349968253968254, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.3982603549957275, "step": 11024 }, { "epoch": 0.350031746031746, "grad_norm": 0.11767578125, "learning_rate": 0.1, "loss": 2.417543888092041, "step": 11026 }, { "epoch": 0.35009523809523807, "grad_norm": 0.4921875, "learning_rate": 0.1, "loss": 2.4204814434051514, "step": 11028 }, { "epoch": 0.35015873015873017, "grad_norm": 0.2470703125, "learning_rate": 0.1, "loss": 2.423769235610962, "step": 11030 }, { "epoch": 0.3502222222222222, "grad_norm": 0.26171875, "learning_rate": 0.1, "loss": 2.403280019760132, "step": 11032 }, { "epoch": 0.3502857142857143, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.391350746154785, "step": 11034 }, { "epoch": 0.35034920634920635, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.422663927078247, "step": 11036 }, { "epoch": 0.3504126984126984, "grad_norm": 0.4140625, "learning_rate": 0.1, "loss": 2.4171299934387207, "step": 11038 }, { "epoch": 0.3504761904761905, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.4181694984436035, "step": 11040 }, { "epoch": 0.35053968253968254, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.4219069480895996, "step": 11042 }, { "epoch": 0.3506031746031746, "grad_norm": 0.05517578125, "learning_rate": 0.1, "loss": 2.4353079795837402, "step": 11044 }, { "epoch": 0.3506666666666667, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.4211533069610596, "step": 11046 }, { "epoch": 0.3507301587301587, "grad_norm": 0.21484375, "learning_rate": 0.1, "loss": 2.4109206199645996, "step": 11048 }, { "epoch": 0.35079365079365077, "grad_norm": 0.234375, "learning_rate": 0.1, "loss": 2.42941951751709, "step": 11050 }, { "epoch": 0.35085714285714287, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.3879783153533936, "step": 11052 }, { "epoch": 0.3509206349206349, "grad_norm": 0.359375, "learning_rate": 0.1, "loss": 2.4356982707977295, "step": 11054 }, { "epoch": 0.350984126984127, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.4666500091552734, "step": 11056 }, { "epoch": 0.35104761904761905, "grad_norm": 0.072265625, "learning_rate": 0.1, "loss": 2.4236481189727783, "step": 11058 }, { "epoch": 0.3511111111111111, "grad_norm": 0.2421875, "learning_rate": 0.1, "loss": 2.4124252796173096, "step": 11060 }, { "epoch": 0.3511746031746032, "grad_norm": 0.314453125, "learning_rate": 0.1, "loss": 2.432394027709961, "step": 11062 }, { "epoch": 0.35123809523809524, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.4237029552459717, "step": 11064 }, { "epoch": 0.3513015873015873, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.424628496170044, "step": 11066 }, { "epoch": 0.3513650793650794, "grad_norm": 0.310546875, "learning_rate": 0.1, "loss": 2.4362010955810547, "step": 11068 }, { "epoch": 0.3514285714285714, "grad_norm": 0.203125, "learning_rate": 0.1, "loss": 2.437283992767334, "step": 11070 }, { "epoch": 0.35149206349206347, "grad_norm": 0.107421875, "learning_rate": 0.1, "loss": 2.446931838989258, "step": 11072 }, { "epoch": 0.35155555555555557, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.439727783203125, "step": 11074 }, { "epoch": 0.3516190476190476, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.451850175857544, "step": 11076 }, { "epoch": 0.3516825396825397, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.431602954864502, "step": 11078 }, { "epoch": 0.35174603174603175, "grad_norm": 0.33984375, "learning_rate": 0.1, "loss": 2.4908480644226074, "step": 11080 }, { "epoch": 0.3518095238095238, "grad_norm": 0.2431640625, "learning_rate": 0.1, "loss": 2.444524049758911, "step": 11082 }, { "epoch": 0.3518730158730159, "grad_norm": 0.244140625, "learning_rate": 0.1, "loss": 2.432138204574585, "step": 11084 }, { "epoch": 0.35193650793650794, "grad_norm": 0.310546875, "learning_rate": 0.1, "loss": 2.4384355545043945, "step": 11086 }, { "epoch": 0.352, "grad_norm": 0.224609375, "learning_rate": 0.1, "loss": 2.4333276748657227, "step": 11088 }, { "epoch": 0.3520634920634921, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.467416524887085, "step": 11090 }, { "epoch": 0.3521269841269841, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.4466445446014404, "step": 11092 }, { "epoch": 0.35219047619047616, "grad_norm": 0.37890625, "learning_rate": 0.1, "loss": 2.461064338684082, "step": 11094 }, { "epoch": 0.35225396825396826, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.464916229248047, "step": 11096 }, { "epoch": 0.3523174603174603, "grad_norm": 0.0771484375, "learning_rate": 0.1, "loss": 2.4417896270751953, "step": 11098 }, { "epoch": 0.3523809523809524, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.4504122734069824, "step": 11100 }, { "epoch": 0.35244444444444445, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.46895694732666, "step": 11102 }, { "epoch": 0.3525079365079365, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.4499869346618652, "step": 11104 }, { "epoch": 0.3525714285714286, "grad_norm": 0.046875, "learning_rate": 0.1, "loss": 2.463418483734131, "step": 11106 }, { "epoch": 0.35263492063492063, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.453564405441284, "step": 11108 }, { "epoch": 0.3526984126984127, "grad_norm": 0.1982421875, "learning_rate": 0.1, "loss": 2.4937078952789307, "step": 11110 }, { "epoch": 0.3527619047619048, "grad_norm": 0.486328125, "learning_rate": 0.1, "loss": 2.4809000492095947, "step": 11112 }, { "epoch": 0.3528253968253968, "grad_norm": 0.30078125, "learning_rate": 0.1, "loss": 2.503629446029663, "step": 11114 }, { "epoch": 0.35288888888888886, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.464132070541382, "step": 11116 }, { "epoch": 0.35295238095238096, "grad_norm": 0.341796875, "learning_rate": 0.1, "loss": 2.5101478099823, "step": 11118 }, { "epoch": 0.353015873015873, "grad_norm": 0.328125, "learning_rate": 0.1, "loss": 2.492459297180176, "step": 11120 }, { "epoch": 0.3530793650793651, "grad_norm": 0.07666015625, "learning_rate": 0.1, "loss": 2.4816741943359375, "step": 11122 }, { "epoch": 0.35314285714285715, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.4845693111419678, "step": 11124 }, { "epoch": 0.3532063492063492, "grad_norm": 0.11376953125, "learning_rate": 0.1, "loss": 2.47183895111084, "step": 11126 }, { "epoch": 0.3532698412698413, "grad_norm": 0.1982421875, "learning_rate": 0.1, "loss": 2.5153868198394775, "step": 11128 }, { "epoch": 0.35333333333333333, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.5130763053894043, "step": 11130 }, { "epoch": 0.3533968253968254, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.487558126449585, "step": 11132 }, { "epoch": 0.3534603174603175, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.501115322113037, "step": 11134 }, { "epoch": 0.3535238095238095, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.516602039337158, "step": 11136 }, { "epoch": 0.35358730158730156, "grad_norm": 0.341796875, "learning_rate": 0.1, "loss": 2.49763560295105, "step": 11138 }, { "epoch": 0.35365079365079366, "grad_norm": 0.482421875, "learning_rate": 0.1, "loss": 2.4931797981262207, "step": 11140 }, { "epoch": 0.3537142857142857, "grad_norm": 0.26953125, "learning_rate": 0.1, "loss": 2.5045690536499023, "step": 11142 }, { "epoch": 0.3537777777777778, "grad_norm": 0.318359375, "learning_rate": 0.1, "loss": 2.4567301273345947, "step": 11144 }, { "epoch": 0.35384126984126985, "grad_norm": 0.4375, "learning_rate": 0.1, "loss": 2.4942221641540527, "step": 11146 }, { "epoch": 0.3539047619047619, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.494699001312256, "step": 11148 }, { "epoch": 0.353968253968254, "grad_norm": 0.185546875, "learning_rate": 0.1, "loss": 2.503068685531616, "step": 11150 }, { "epoch": 0.35403174603174603, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.525214672088623, "step": 11152 }, { "epoch": 0.3540952380952381, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.475733518600464, "step": 11154 }, { "epoch": 0.3541587301587302, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.518965005874634, "step": 11156 }, { "epoch": 0.3542222222222222, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.4847307205200195, "step": 11158 }, { "epoch": 0.35428571428571426, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.5241191387176514, "step": 11160 }, { "epoch": 0.35434920634920636, "grad_norm": 0.0673828125, "learning_rate": 0.1, "loss": 2.5268568992614746, "step": 11162 }, { "epoch": 0.3544126984126984, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.489588975906372, "step": 11164 }, { "epoch": 0.3544761904761905, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.517854928970337, "step": 11166 }, { "epoch": 0.35453968253968254, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.5115416049957275, "step": 11168 }, { "epoch": 0.3546031746031746, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.5270791053771973, "step": 11170 }, { "epoch": 0.3546666666666667, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.49017596244812, "step": 11172 }, { "epoch": 0.35473015873015873, "grad_norm": 0.07080078125, "learning_rate": 0.1, "loss": 2.486820697784424, "step": 11174 }, { "epoch": 0.35479365079365077, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.4675469398498535, "step": 11176 }, { "epoch": 0.35485714285714287, "grad_norm": 0.5625, "learning_rate": 0.1, "loss": 2.5254688262939453, "step": 11178 }, { "epoch": 0.3549206349206349, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.49434494972229, "step": 11180 }, { "epoch": 0.35498412698412696, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.473165273666382, "step": 11182 }, { "epoch": 0.35504761904761906, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.496553421020508, "step": 11184 }, { "epoch": 0.3551111111111111, "grad_norm": 0.609375, "learning_rate": 0.1, "loss": 2.5094237327575684, "step": 11186 }, { "epoch": 0.3551746031746032, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.4690942764282227, "step": 11188 }, { "epoch": 0.35523809523809524, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.4825117588043213, "step": 11190 }, { "epoch": 0.3553015873015873, "grad_norm": 0.259765625, "learning_rate": 0.1, "loss": 2.530270576477051, "step": 11192 }, { "epoch": 0.3553650793650794, "grad_norm": 0.0947265625, "learning_rate": 0.1, "loss": 2.500499725341797, "step": 11194 }, { "epoch": 0.3554285714285714, "grad_norm": 0.0849609375, "learning_rate": 0.1, "loss": 2.483163595199585, "step": 11196 }, { "epoch": 0.35549206349206347, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.5392792224884033, "step": 11198 }, { "epoch": 0.35555555555555557, "grad_norm": 0.0751953125, "learning_rate": 0.1, "loss": 2.4986934661865234, "step": 11200 }, { "epoch": 0.3556190476190476, "grad_norm": 0.04931640625, "learning_rate": 0.1, "loss": 2.480569362640381, "step": 11202 }, { "epoch": 0.35568253968253966, "grad_norm": 0.06494140625, "learning_rate": 0.1, "loss": 2.4624855518341064, "step": 11204 }, { "epoch": 0.35574603174603175, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.493828296661377, "step": 11206 }, { "epoch": 0.3558095238095238, "grad_norm": 0.203125, "learning_rate": 0.1, "loss": 2.491539239883423, "step": 11208 }, { "epoch": 0.3558730158730159, "grad_norm": 0.0478515625, "learning_rate": 0.1, "loss": 2.497725486755371, "step": 11210 }, { "epoch": 0.35593650793650794, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.4691996574401855, "step": 11212 }, { "epoch": 0.356, "grad_norm": 0.3671875, "learning_rate": 0.1, "loss": 2.4893603324890137, "step": 11214 }, { "epoch": 0.3560634920634921, "grad_norm": 0.2734375, "learning_rate": 0.1, "loss": 2.4978229999542236, "step": 11216 }, { "epoch": 0.3561269841269841, "grad_norm": 0.053466796875, "learning_rate": 0.1, "loss": 2.4881231784820557, "step": 11218 }, { "epoch": 0.35619047619047617, "grad_norm": 0.10595703125, "learning_rate": 0.1, "loss": 2.46063232421875, "step": 11220 }, { "epoch": 0.35625396825396827, "grad_norm": 0.09033203125, "learning_rate": 0.1, "loss": 2.5022330284118652, "step": 11222 }, { "epoch": 0.3563174603174603, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.5123372077941895, "step": 11224 }, { "epoch": 0.35638095238095235, "grad_norm": 0.62890625, "learning_rate": 0.1, "loss": 2.4963247776031494, "step": 11226 }, { "epoch": 0.35644444444444445, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.4859702587127686, "step": 11228 }, { "epoch": 0.3565079365079365, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.475501775741577, "step": 11230 }, { "epoch": 0.3565714285714286, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.490372657775879, "step": 11232 }, { "epoch": 0.35663492063492064, "grad_norm": 0.349609375, "learning_rate": 0.1, "loss": 2.480090618133545, "step": 11234 }, { "epoch": 0.3566984126984127, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.5044050216674805, "step": 11236 }, { "epoch": 0.3567619047619048, "grad_norm": 0.09521484375, "learning_rate": 0.1, "loss": 2.4619669914245605, "step": 11238 }, { "epoch": 0.3568253968253968, "grad_norm": 0.080078125, "learning_rate": 0.1, "loss": 2.4584763050079346, "step": 11240 }, { "epoch": 0.35688888888888887, "grad_norm": 0.197265625, "learning_rate": 0.1, "loss": 2.478175640106201, "step": 11242 }, { "epoch": 0.35695238095238097, "grad_norm": 0.3515625, "learning_rate": 0.1, "loss": 2.476935386657715, "step": 11244 }, { "epoch": 0.357015873015873, "grad_norm": 0.244140625, "learning_rate": 0.1, "loss": 2.461888074874878, "step": 11246 }, { "epoch": 0.35707936507936505, "grad_norm": 0.1142578125, "learning_rate": 0.1, "loss": 2.449694871902466, "step": 11248 }, { "epoch": 0.35714285714285715, "grad_norm": 0.244140625, "learning_rate": 0.1, "loss": 2.4589521884918213, "step": 11250 }, { "epoch": 0.3572063492063492, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.469026565551758, "step": 11252 }, { "epoch": 0.3572698412698413, "grad_norm": 0.2294921875, "learning_rate": 0.1, "loss": 2.463324785232544, "step": 11254 }, { "epoch": 0.35733333333333334, "grad_norm": 0.37109375, "learning_rate": 0.1, "loss": 2.450981616973877, "step": 11256 }, { "epoch": 0.3573968253968254, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.4704065322875977, "step": 11258 }, { "epoch": 0.3574603174603175, "grad_norm": 0.10400390625, "learning_rate": 0.1, "loss": 2.4855854511260986, "step": 11260 }, { "epoch": 0.3575238095238095, "grad_norm": 0.091796875, "learning_rate": 0.1, "loss": 2.459942102432251, "step": 11262 }, { "epoch": 0.35758730158730156, "grad_norm": 0.365234375, "learning_rate": 0.1, "loss": 2.4716644287109375, "step": 11264 }, { "epoch": 0.35765079365079366, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.4955148696899414, "step": 11266 }, { "epoch": 0.3577142857142857, "grad_norm": 0.076171875, "learning_rate": 0.1, "loss": 2.462697982788086, "step": 11268 }, { "epoch": 0.35777777777777775, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.4656829833984375, "step": 11270 }, { "epoch": 0.35784126984126985, "grad_norm": 0.09423828125, "learning_rate": 0.1, "loss": 2.458569288253784, "step": 11272 }, { "epoch": 0.3579047619047619, "grad_norm": 0.0771484375, "learning_rate": 0.1, "loss": 2.4550700187683105, "step": 11274 }, { "epoch": 0.357968253968254, "grad_norm": 0.1142578125, "learning_rate": 0.1, "loss": 2.4642810821533203, "step": 11276 }, { "epoch": 0.35803174603174603, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.454416513442993, "step": 11278 }, { "epoch": 0.3580952380952381, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.469170570373535, "step": 11280 }, { "epoch": 0.3581587301587302, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.481339693069458, "step": 11282 }, { "epoch": 0.3582222222222222, "grad_norm": 0.458984375, "learning_rate": 0.1, "loss": 2.467975378036499, "step": 11284 }, { "epoch": 0.35828571428571426, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.5128746032714844, "step": 11286 }, { "epoch": 0.35834920634920636, "grad_norm": 0.08154296875, "learning_rate": 0.1, "loss": 2.4856278896331787, "step": 11288 }, { "epoch": 0.3584126984126984, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.4855496883392334, "step": 11290 }, { "epoch": 0.3584761904761905, "grad_norm": 0.28125, "learning_rate": 0.1, "loss": 2.4849109649658203, "step": 11292 }, { "epoch": 0.35853968253968255, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.4842143058776855, "step": 11294 }, { "epoch": 0.3586031746031746, "grad_norm": 0.3671875, "learning_rate": 0.1, "loss": 2.451904058456421, "step": 11296 }, { "epoch": 0.3586666666666667, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.4747142791748047, "step": 11298 }, { "epoch": 0.35873015873015873, "grad_norm": 0.2294921875, "learning_rate": 0.1, "loss": 2.4832489490509033, "step": 11300 }, { "epoch": 0.3587936507936508, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.4741103649139404, "step": 11302 }, { "epoch": 0.3588571428571429, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.470633029937744, "step": 11304 }, { "epoch": 0.3589206349206349, "grad_norm": 0.228515625, "learning_rate": 0.1, "loss": 2.491023063659668, "step": 11306 }, { "epoch": 0.35898412698412696, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.4867446422576904, "step": 11308 }, { "epoch": 0.35904761904761906, "grad_norm": 0.1875, "learning_rate": 0.1, "loss": 2.4757778644561768, "step": 11310 }, { "epoch": 0.3591111111111111, "grad_norm": 0.236328125, "learning_rate": 0.1, "loss": 2.483226776123047, "step": 11312 }, { "epoch": 0.3591746031746032, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.509500026702881, "step": 11314 }, { "epoch": 0.35923809523809525, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.4560673236846924, "step": 11316 }, { "epoch": 0.3593015873015873, "grad_norm": 0.35546875, "learning_rate": 0.1, "loss": 2.4755465984344482, "step": 11318 }, { "epoch": 0.3593650793650794, "grad_norm": 0.390625, "learning_rate": 0.1, "loss": 2.4875471591949463, "step": 11320 }, { "epoch": 0.35942857142857143, "grad_norm": 0.1376953125, "learning_rate": 0.1, "loss": 2.501183032989502, "step": 11322 }, { "epoch": 0.3594920634920635, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.482649087905884, "step": 11324 }, { "epoch": 0.3595555555555556, "grad_norm": 0.33984375, "learning_rate": 0.1, "loss": 2.5185811519622803, "step": 11326 }, { "epoch": 0.3596190476190476, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.5071229934692383, "step": 11328 }, { "epoch": 0.35968253968253966, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.4960479736328125, "step": 11330 }, { "epoch": 0.35974603174603176, "grad_norm": 0.0732421875, "learning_rate": 0.1, "loss": 2.4599719047546387, "step": 11332 }, { "epoch": 0.3598095238095238, "grad_norm": 0.0712890625, "learning_rate": 0.1, "loss": 2.492602586746216, "step": 11334 }, { "epoch": 0.3598730158730159, "grad_norm": 0.0615234375, "learning_rate": 0.1, "loss": 2.467595100402832, "step": 11336 }, { "epoch": 0.35993650793650794, "grad_norm": 0.0966796875, "learning_rate": 0.1, "loss": 2.5092246532440186, "step": 11338 }, { "epoch": 0.36, "grad_norm": 0.2373046875, "learning_rate": 0.1, "loss": 2.4747467041015625, "step": 11340 }, { "epoch": 0.3600634920634921, "grad_norm": 0.318359375, "learning_rate": 0.1, "loss": 2.5079941749572754, "step": 11342 }, { "epoch": 0.36012698412698413, "grad_norm": 0.10498046875, "learning_rate": 0.1, "loss": 2.453760862350464, "step": 11344 }, { "epoch": 0.36019047619047617, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.4873969554901123, "step": 11346 }, { "epoch": 0.36025396825396827, "grad_norm": 0.443359375, "learning_rate": 0.1, "loss": 2.510312080383301, "step": 11348 }, { "epoch": 0.3603174603174603, "grad_norm": 0.1875, "learning_rate": 0.1, "loss": 2.477294683456421, "step": 11350 }, { "epoch": 0.36038095238095236, "grad_norm": 0.1328125, "learning_rate": 0.1, "loss": 2.4856131076812744, "step": 11352 }, { "epoch": 0.36044444444444446, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.488382577896118, "step": 11354 }, { "epoch": 0.3605079365079365, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.477754831314087, "step": 11356 }, { "epoch": 0.3605714285714286, "grad_norm": 0.423828125, "learning_rate": 0.1, "loss": 2.481013774871826, "step": 11358 }, { "epoch": 0.36063492063492064, "grad_norm": 0.484375, "learning_rate": 0.1, "loss": 2.490089178085327, "step": 11360 }, { "epoch": 0.3606984126984127, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.5008773803710938, "step": 11362 }, { "epoch": 0.3607619047619048, "grad_norm": 0.232421875, "learning_rate": 0.1, "loss": 2.492215871810913, "step": 11364 }, { "epoch": 0.3608253968253968, "grad_norm": 0.345703125, "learning_rate": 0.1, "loss": 2.4556965827941895, "step": 11366 }, { "epoch": 0.36088888888888887, "grad_norm": 0.294921875, "learning_rate": 0.1, "loss": 2.5121169090270996, "step": 11368 }, { "epoch": 0.36095238095238097, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.476003885269165, "step": 11370 }, { "epoch": 0.361015873015873, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.4960570335388184, "step": 11372 }, { "epoch": 0.36107936507936506, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.501624822616577, "step": 11374 }, { "epoch": 0.36114285714285715, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.5003836154937744, "step": 11376 }, { "epoch": 0.3612063492063492, "grad_norm": 0.2333984375, "learning_rate": 0.1, "loss": 2.4826653003692627, "step": 11378 }, { "epoch": 0.3612698412698413, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.495271682739258, "step": 11380 }, { "epoch": 0.36133333333333334, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.492497205734253, "step": 11382 }, { "epoch": 0.3613968253968254, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.495521306991577, "step": 11384 }, { "epoch": 0.3614603174603175, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.488354444503784, "step": 11386 }, { "epoch": 0.3615238095238095, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.4690821170806885, "step": 11388 }, { "epoch": 0.36158730158730157, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.473665475845337, "step": 11390 }, { "epoch": 0.36165079365079367, "grad_norm": 0.12255859375, "learning_rate": 0.1, "loss": 2.507917642593384, "step": 11392 }, { "epoch": 0.3617142857142857, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.496009588241577, "step": 11394 }, { "epoch": 0.36177777777777775, "grad_norm": 0.4140625, "learning_rate": 0.1, "loss": 2.498739719390869, "step": 11396 }, { "epoch": 0.36184126984126985, "grad_norm": 0.353515625, "learning_rate": 0.1, "loss": 2.4704389572143555, "step": 11398 }, { "epoch": 0.3619047619047619, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.4971871376037598, "step": 11400 }, { "epoch": 0.361968253968254, "grad_norm": 0.10205078125, "learning_rate": 0.1, "loss": 2.4943349361419678, "step": 11402 }, { "epoch": 0.36203174603174604, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.4700849056243896, "step": 11404 }, { "epoch": 0.3620952380952381, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.4819083213806152, "step": 11406 }, { "epoch": 0.3621587301587302, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.4707014560699463, "step": 11408 }, { "epoch": 0.3622222222222222, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.478113889694214, "step": 11410 }, { "epoch": 0.36228571428571427, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.5012660026550293, "step": 11412 }, { "epoch": 0.36234920634920637, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.5109262466430664, "step": 11414 }, { "epoch": 0.3624126984126984, "grad_norm": 0.55078125, "learning_rate": 0.1, "loss": 2.4876835346221924, "step": 11416 }, { "epoch": 0.36247619047619045, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.488044023513794, "step": 11418 }, { "epoch": 0.36253968253968255, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.474841594696045, "step": 11420 }, { "epoch": 0.3626031746031746, "grad_norm": 0.2275390625, "learning_rate": 0.1, "loss": 2.457033157348633, "step": 11422 }, { "epoch": 0.3626666666666667, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.4880568981170654, "step": 11424 }, { "epoch": 0.36273015873015874, "grad_norm": 0.1171875, "learning_rate": 0.1, "loss": 2.4715211391448975, "step": 11426 }, { "epoch": 0.3627936507936508, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.4934537410736084, "step": 11428 }, { "epoch": 0.3628571428571429, "grad_norm": 0.294921875, "learning_rate": 0.1, "loss": 2.498490333557129, "step": 11430 }, { "epoch": 0.3629206349206349, "grad_norm": 0.498046875, "learning_rate": 0.1, "loss": 2.517286777496338, "step": 11432 }, { "epoch": 0.36298412698412696, "grad_norm": 0.06591796875, "learning_rate": 0.1, "loss": 2.502140760421753, "step": 11434 }, { "epoch": 0.36304761904761906, "grad_norm": 0.06396484375, "learning_rate": 0.1, "loss": 2.482360601425171, "step": 11436 }, { "epoch": 0.3631111111111111, "grad_norm": 0.064453125, "learning_rate": 0.1, "loss": 2.470284938812256, "step": 11438 }, { "epoch": 0.36317460317460315, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.4868221282958984, "step": 11440 }, { "epoch": 0.36323809523809525, "grad_norm": 0.2216796875, "learning_rate": 0.1, "loss": 2.4566707611083984, "step": 11442 }, { "epoch": 0.3633015873015873, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.509664297103882, "step": 11444 }, { "epoch": 0.3633650793650794, "grad_norm": 0.061767578125, "learning_rate": 0.1, "loss": 2.4820139408111572, "step": 11446 }, { "epoch": 0.36342857142857143, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.500307559967041, "step": 11448 }, { "epoch": 0.3634920634920635, "grad_norm": 0.251953125, "learning_rate": 0.1, "loss": 2.495778799057007, "step": 11450 }, { "epoch": 0.3635555555555556, "grad_norm": 0.189453125, "learning_rate": 0.1, "loss": 2.4892730712890625, "step": 11452 }, { "epoch": 0.3636190476190476, "grad_norm": 0.5546875, "learning_rate": 0.1, "loss": 2.4844303131103516, "step": 11454 }, { "epoch": 0.36368253968253966, "grad_norm": 0.369140625, "learning_rate": 0.1, "loss": 2.50929856300354, "step": 11456 }, { "epoch": 0.36374603174603176, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.4976625442504883, "step": 11458 }, { "epoch": 0.3638095238095238, "grad_norm": 0.076171875, "learning_rate": 0.1, "loss": 2.495049238204956, "step": 11460 }, { "epoch": 0.36387301587301585, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.5226457118988037, "step": 11462 }, { "epoch": 0.36393650793650795, "grad_norm": 0.1953125, "learning_rate": 0.1, "loss": 2.4973907470703125, "step": 11464 }, { "epoch": 0.364, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.4904561042785645, "step": 11466 }, { "epoch": 0.3640634920634921, "grad_norm": 0.060302734375, "learning_rate": 0.1, "loss": 2.470416784286499, "step": 11468 }, { "epoch": 0.36412698412698413, "grad_norm": 0.2431640625, "learning_rate": 0.1, "loss": 2.502138137817383, "step": 11470 }, { "epoch": 0.3641904761904762, "grad_norm": 0.490234375, "learning_rate": 0.1, "loss": 2.489926338195801, "step": 11472 }, { "epoch": 0.3642539682539683, "grad_norm": 0.2333984375, "learning_rate": 0.1, "loss": 2.488893747329712, "step": 11474 }, { "epoch": 0.3643174603174603, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.5296382904052734, "step": 11476 }, { "epoch": 0.36438095238095236, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.469823122024536, "step": 11478 }, { "epoch": 0.36444444444444446, "grad_norm": 0.10498046875, "learning_rate": 0.1, "loss": 2.4645936489105225, "step": 11480 }, { "epoch": 0.3645079365079365, "grad_norm": 0.12353515625, "learning_rate": 0.1, "loss": 2.4924604892730713, "step": 11482 }, { "epoch": 0.36457142857142855, "grad_norm": 0.06787109375, "learning_rate": 0.1, "loss": 2.5029876232147217, "step": 11484 }, { "epoch": 0.36463492063492065, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.523407220840454, "step": 11486 }, { "epoch": 0.3646984126984127, "grad_norm": 0.357421875, "learning_rate": 0.1, "loss": 2.4972376823425293, "step": 11488 }, { "epoch": 0.3647619047619048, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.462998151779175, "step": 11490 }, { "epoch": 0.36482539682539683, "grad_norm": 0.10791015625, "learning_rate": 0.1, "loss": 2.500276803970337, "step": 11492 }, { "epoch": 0.3648888888888889, "grad_norm": 0.0927734375, "learning_rate": 0.1, "loss": 2.4935381412506104, "step": 11494 }, { "epoch": 0.364952380952381, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.4755890369415283, "step": 11496 }, { "epoch": 0.365015873015873, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.501697301864624, "step": 11498 }, { "epoch": 0.36507936507936506, "grad_norm": 0.48046875, "learning_rate": 0.1, "loss": 2.5090396404266357, "step": 11500 }, { "epoch": 0.36514285714285716, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.4752163887023926, "step": 11502 }, { "epoch": 0.3652063492063492, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.4872875213623047, "step": 11504 }, { "epoch": 0.36526984126984124, "grad_norm": 0.068359375, "learning_rate": 0.1, "loss": 2.4730489253997803, "step": 11506 }, { "epoch": 0.36533333333333334, "grad_norm": 0.373046875, "learning_rate": 0.1, "loss": 2.4886996746063232, "step": 11508 }, { "epoch": 0.3653968253968254, "grad_norm": 0.35546875, "learning_rate": 0.1, "loss": 2.476940870285034, "step": 11510 }, { "epoch": 0.3654603174603175, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.4784579277038574, "step": 11512 }, { "epoch": 0.36552380952380953, "grad_norm": 0.10009765625, "learning_rate": 0.1, "loss": 2.4905340671539307, "step": 11514 }, { "epoch": 0.36558730158730157, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.4924633502960205, "step": 11516 }, { "epoch": 0.36565079365079367, "grad_norm": 0.06591796875, "learning_rate": 0.1, "loss": 2.459251642227173, "step": 11518 }, { "epoch": 0.3657142857142857, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.4829344749450684, "step": 11520 }, { "epoch": 0.36577777777777776, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.5123536586761475, "step": 11522 }, { "epoch": 0.36584126984126986, "grad_norm": 0.1220703125, "learning_rate": 0.1, "loss": 2.4925343990325928, "step": 11524 }, { "epoch": 0.3659047619047619, "grad_norm": 0.220703125, "learning_rate": 0.1, "loss": 2.463898181915283, "step": 11526 }, { "epoch": 0.36596825396825394, "grad_norm": 0.1728515625, "learning_rate": 0.1, "loss": 2.4763333797454834, "step": 11528 }, { "epoch": 0.36603174603174604, "grad_norm": 0.353515625, "learning_rate": 0.1, "loss": 2.515787124633789, "step": 11530 }, { "epoch": 0.3660952380952381, "grad_norm": 0.361328125, "learning_rate": 0.1, "loss": 2.4792990684509277, "step": 11532 }, { "epoch": 0.3661587301587302, "grad_norm": 0.11572265625, "learning_rate": 0.1, "loss": 2.487999200820923, "step": 11534 }, { "epoch": 0.3662222222222222, "grad_norm": 0.0966796875, "learning_rate": 0.1, "loss": 2.4968855381011963, "step": 11536 }, { "epoch": 0.36628571428571427, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.4591152667999268, "step": 11538 }, { "epoch": 0.36634920634920637, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.4713194370269775, "step": 11540 }, { "epoch": 0.3664126984126984, "grad_norm": 0.259765625, "learning_rate": 0.1, "loss": 2.4807326793670654, "step": 11542 }, { "epoch": 0.36647619047619046, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.4585866928100586, "step": 11544 }, { "epoch": 0.36653968253968255, "grad_norm": 0.28125, "learning_rate": 0.1, "loss": 2.4734954833984375, "step": 11546 }, { "epoch": 0.3666031746031746, "grad_norm": 0.2197265625, "learning_rate": 0.1, "loss": 2.5005085468292236, "step": 11548 }, { "epoch": 0.36666666666666664, "grad_norm": 0.365234375, "learning_rate": 0.1, "loss": 2.489720582962036, "step": 11550 }, { "epoch": 0.36673015873015874, "grad_norm": 0.435546875, "learning_rate": 0.1, "loss": 2.4803593158721924, "step": 11552 }, { "epoch": 0.3667936507936508, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.4626052379608154, "step": 11554 }, { "epoch": 0.3668571428571429, "grad_norm": 0.09521484375, "learning_rate": 0.1, "loss": 2.512833833694458, "step": 11556 }, { "epoch": 0.3669206349206349, "grad_norm": 0.1416015625, "learning_rate": 0.1, "loss": 2.4813570976257324, "step": 11558 }, { "epoch": 0.36698412698412697, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.470029354095459, "step": 11560 }, { "epoch": 0.36704761904761907, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.4803454875946045, "step": 11562 }, { "epoch": 0.3671111111111111, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.481099843978882, "step": 11564 }, { "epoch": 0.36717460317460315, "grad_norm": 0.10009765625, "learning_rate": 0.1, "loss": 2.4896509647369385, "step": 11566 }, { "epoch": 0.36723809523809525, "grad_norm": 0.1142578125, "learning_rate": 0.1, "loss": 2.4700539112091064, "step": 11568 }, { "epoch": 0.3673015873015873, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.4857864379882812, "step": 11570 }, { "epoch": 0.36736507936507934, "grad_norm": 0.06787109375, "learning_rate": 0.1, "loss": 2.4712841510772705, "step": 11572 }, { "epoch": 0.36742857142857144, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.4903788566589355, "step": 11574 }, { "epoch": 0.3674920634920635, "grad_norm": 0.427734375, "learning_rate": 0.1, "loss": 2.4827778339385986, "step": 11576 }, { "epoch": 0.3675555555555556, "grad_norm": 0.06298828125, "learning_rate": 0.1, "loss": 2.481940269470215, "step": 11578 }, { "epoch": 0.3676190476190476, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.488921642303467, "step": 11580 }, { "epoch": 0.36768253968253967, "grad_norm": 0.12255859375, "learning_rate": 0.1, "loss": 2.487088918685913, "step": 11582 }, { "epoch": 0.36774603174603177, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.476516008377075, "step": 11584 }, { "epoch": 0.3678095238095238, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.4968202114105225, "step": 11586 }, { "epoch": 0.36787301587301585, "grad_norm": 0.0869140625, "learning_rate": 0.1, "loss": 2.4828453063964844, "step": 11588 }, { "epoch": 0.36793650793650795, "grad_norm": 0.2451171875, "learning_rate": 0.1, "loss": 2.4897444248199463, "step": 11590 }, { "epoch": 0.368, "grad_norm": 0.443359375, "learning_rate": 0.1, "loss": 2.4958691596984863, "step": 11592 }, { "epoch": 0.36806349206349204, "grad_norm": 0.35546875, "learning_rate": 0.1, "loss": 2.4981982707977295, "step": 11594 }, { "epoch": 0.36812698412698414, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.4794466495513916, "step": 11596 }, { "epoch": 0.3681904761904762, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.477478265762329, "step": 11598 }, { "epoch": 0.3682539682539683, "grad_norm": 0.2392578125, "learning_rate": 0.1, "loss": 2.4623732566833496, "step": 11600 }, { "epoch": 0.3683174603174603, "grad_norm": 0.1689453125, "learning_rate": 0.1, "loss": 2.475332498550415, "step": 11602 }, { "epoch": 0.36838095238095236, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.4902873039245605, "step": 11604 }, { "epoch": 0.36844444444444446, "grad_norm": 0.3046875, "learning_rate": 0.1, "loss": 2.5031120777130127, "step": 11606 }, { "epoch": 0.3685079365079365, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.4824001789093018, "step": 11608 }, { "epoch": 0.36857142857142855, "grad_norm": 0.08203125, "learning_rate": 0.1, "loss": 2.507827043533325, "step": 11610 }, { "epoch": 0.36863492063492065, "grad_norm": 0.296875, "learning_rate": 0.1, "loss": 2.4686498641967773, "step": 11612 }, { "epoch": 0.3686984126984127, "grad_norm": 0.50390625, "learning_rate": 0.1, "loss": 2.4797205924987793, "step": 11614 }, { "epoch": 0.36876190476190474, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.5164663791656494, "step": 11616 }, { "epoch": 0.36882539682539683, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.491569757461548, "step": 11618 }, { "epoch": 0.3688888888888889, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.48038911819458, "step": 11620 }, { "epoch": 0.368952380952381, "grad_norm": 0.0576171875, "learning_rate": 0.1, "loss": 2.4934587478637695, "step": 11622 }, { "epoch": 0.369015873015873, "grad_norm": 0.056884765625, "learning_rate": 0.1, "loss": 2.460035800933838, "step": 11624 }, { "epoch": 0.36907936507936506, "grad_norm": 0.224609375, "learning_rate": 0.1, "loss": 2.473371982574463, "step": 11626 }, { "epoch": 0.36914285714285716, "grad_norm": 0.078125, "learning_rate": 0.1, "loss": 2.4780993461608887, "step": 11628 }, { "epoch": 0.3692063492063492, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.466709613800049, "step": 11630 }, { "epoch": 0.36926984126984125, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.462557792663574, "step": 11632 }, { "epoch": 0.36933333333333335, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.4835760593414307, "step": 11634 }, { "epoch": 0.3693968253968254, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.4916019439697266, "step": 11636 }, { "epoch": 0.36946031746031743, "grad_norm": 0.1982421875, "learning_rate": 0.1, "loss": 2.480407476425171, "step": 11638 }, { "epoch": 0.36952380952380953, "grad_norm": 0.26171875, "learning_rate": 0.1, "loss": 2.482396364212036, "step": 11640 }, { "epoch": 0.3695873015873016, "grad_norm": 0.30078125, "learning_rate": 0.1, "loss": 2.4516971111297607, "step": 11642 }, { "epoch": 0.3696507936507937, "grad_norm": 0.490234375, "learning_rate": 0.1, "loss": 2.466508150100708, "step": 11644 }, { "epoch": 0.3697142857142857, "grad_norm": 0.169921875, "learning_rate": 0.1, "loss": 2.4519338607788086, "step": 11646 }, { "epoch": 0.36977777777777776, "grad_norm": 0.06005859375, "learning_rate": 0.1, "loss": 2.4561171531677246, "step": 11648 }, { "epoch": 0.36984126984126986, "grad_norm": 0.1748046875, "learning_rate": 0.1, "loss": 2.4761106967926025, "step": 11650 }, { "epoch": 0.3699047619047619, "grad_norm": 0.322265625, "learning_rate": 0.1, "loss": 2.445469856262207, "step": 11652 }, { "epoch": 0.36996825396825395, "grad_norm": 0.296875, "learning_rate": 0.1, "loss": 2.473306179046631, "step": 11654 }, { "epoch": 0.37003174603174604, "grad_norm": 0.31640625, "learning_rate": 0.1, "loss": 2.4608564376831055, "step": 11656 }, { "epoch": 0.3700952380952381, "grad_norm": 0.23046875, "learning_rate": 0.1, "loss": 2.4572176933288574, "step": 11658 }, { "epoch": 0.37015873015873013, "grad_norm": 0.2373046875, "learning_rate": 0.1, "loss": 2.5033202171325684, "step": 11660 }, { "epoch": 0.37022222222222223, "grad_norm": 0.232421875, "learning_rate": 0.1, "loss": 2.4684784412384033, "step": 11662 }, { "epoch": 0.3702857142857143, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.4543795585632324, "step": 11664 }, { "epoch": 0.3703492063492064, "grad_norm": 0.2412109375, "learning_rate": 0.1, "loss": 2.4649386405944824, "step": 11666 }, { "epoch": 0.3704126984126984, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.442139148712158, "step": 11668 }, { "epoch": 0.37047619047619046, "grad_norm": 0.15234375, "learning_rate": 0.1, "loss": 2.467223644256592, "step": 11670 }, { "epoch": 0.37053968253968256, "grad_norm": 0.1025390625, "learning_rate": 0.1, "loss": 2.484752655029297, "step": 11672 }, { "epoch": 0.3706031746031746, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.461017370223999, "step": 11674 }, { "epoch": 0.37066666666666664, "grad_norm": 0.2197265625, "learning_rate": 0.1, "loss": 2.465242624282837, "step": 11676 }, { "epoch": 0.37073015873015874, "grad_norm": 0.2421875, "learning_rate": 0.1, "loss": 2.466881513595581, "step": 11678 }, { "epoch": 0.3707936507936508, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.4572830200195312, "step": 11680 }, { "epoch": 0.37085714285714283, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.473813772201538, "step": 11682 }, { "epoch": 0.37092063492063493, "grad_norm": 0.306640625, "learning_rate": 0.1, "loss": 2.466486930847168, "step": 11684 }, { "epoch": 0.37098412698412697, "grad_norm": 0.353515625, "learning_rate": 0.1, "loss": 2.4515933990478516, "step": 11686 }, { "epoch": 0.37104761904761907, "grad_norm": 0.2431640625, "learning_rate": 0.1, "loss": 2.45233416557312, "step": 11688 }, { "epoch": 0.3711111111111111, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.4697437286376953, "step": 11690 }, { "epoch": 0.37117460317460316, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.4969775676727295, "step": 11692 }, { "epoch": 0.37123809523809526, "grad_norm": 0.07861328125, "learning_rate": 0.1, "loss": 2.4909560680389404, "step": 11694 }, { "epoch": 0.3713015873015873, "grad_norm": 0.09619140625, "learning_rate": 0.1, "loss": 2.466994285583496, "step": 11696 }, { "epoch": 0.37136507936507934, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.456789493560791, "step": 11698 }, { "epoch": 0.37142857142857144, "grad_norm": 0.056396484375, "learning_rate": 0.1, "loss": 2.4662811756134033, "step": 11700 }, { "epoch": 0.3714920634920635, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.4680871963500977, "step": 11702 }, { "epoch": 0.37155555555555553, "grad_norm": 0.75390625, "learning_rate": 0.1, "loss": 2.481945037841797, "step": 11704 }, { "epoch": 0.3716190476190476, "grad_norm": 0.2421875, "learning_rate": 0.1, "loss": 2.465639591217041, "step": 11706 }, { "epoch": 0.37168253968253967, "grad_norm": 0.10498046875, "learning_rate": 0.1, "loss": 2.4653689861297607, "step": 11708 }, { "epoch": 0.37174603174603177, "grad_norm": 0.1171875, "learning_rate": 0.1, "loss": 2.49106764793396, "step": 11710 }, { "epoch": 0.3718095238095238, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.4726955890655518, "step": 11712 }, { "epoch": 0.37187301587301586, "grad_norm": 0.2333984375, "learning_rate": 0.1, "loss": 2.4753713607788086, "step": 11714 }, { "epoch": 0.37193650793650795, "grad_norm": 0.09716796875, "learning_rate": 0.1, "loss": 2.4738142490386963, "step": 11716 }, { "epoch": 0.372, "grad_norm": 0.2255859375, "learning_rate": 0.1, "loss": 2.469127893447876, "step": 11718 }, { "epoch": 0.37206349206349204, "grad_norm": 0.271484375, "learning_rate": 0.1, "loss": 2.498310089111328, "step": 11720 }, { "epoch": 0.37212698412698414, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.4911274909973145, "step": 11722 }, { "epoch": 0.3721904761904762, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.496417760848999, "step": 11724 }, { "epoch": 0.3722539682539683, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.491314172744751, "step": 11726 }, { "epoch": 0.3723174603174603, "grad_norm": 0.11669921875, "learning_rate": 0.1, "loss": 2.4768242835998535, "step": 11728 }, { "epoch": 0.37238095238095237, "grad_norm": 0.0810546875, "learning_rate": 0.1, "loss": 2.4933431148529053, "step": 11730 }, { "epoch": 0.37244444444444447, "grad_norm": 0.08642578125, "learning_rate": 0.1, "loss": 2.504645347595215, "step": 11732 }, { "epoch": 0.3725079365079365, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.4549572467803955, "step": 11734 }, { "epoch": 0.37257142857142855, "grad_norm": 0.328125, "learning_rate": 0.1, "loss": 2.499910354614258, "step": 11736 }, { "epoch": 0.37263492063492065, "grad_norm": 0.671875, "learning_rate": 0.1, "loss": 2.474417209625244, "step": 11738 }, { "epoch": 0.3726984126984127, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.48007869720459, "step": 11740 }, { "epoch": 0.37276190476190474, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.511558771133423, "step": 11742 }, { "epoch": 0.37282539682539684, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.4778945446014404, "step": 11744 }, { "epoch": 0.3728888888888889, "grad_norm": 0.365234375, "learning_rate": 0.1, "loss": 2.465162992477417, "step": 11746 }, { "epoch": 0.372952380952381, "grad_norm": 0.357421875, "learning_rate": 0.1, "loss": 2.4944188594818115, "step": 11748 }, { "epoch": 0.373015873015873, "grad_norm": 0.09814453125, "learning_rate": 0.1, "loss": 2.492619276046753, "step": 11750 }, { "epoch": 0.37307936507936507, "grad_norm": 0.10107421875, "learning_rate": 0.1, "loss": 2.499040365219116, "step": 11752 }, { "epoch": 0.37314285714285716, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.4819164276123047, "step": 11754 }, { "epoch": 0.3732063492063492, "grad_norm": 0.3125, "learning_rate": 0.1, "loss": 2.51578688621521, "step": 11756 }, { "epoch": 0.37326984126984125, "grad_norm": 0.068359375, "learning_rate": 0.1, "loss": 2.4873805046081543, "step": 11758 }, { "epoch": 0.37333333333333335, "grad_norm": 0.064453125, "learning_rate": 0.1, "loss": 2.5044052600860596, "step": 11760 }, { "epoch": 0.3733968253968254, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.470613479614258, "step": 11762 }, { "epoch": 0.37346031746031744, "grad_norm": 0.0439453125, "learning_rate": 0.1, "loss": 2.4653987884521484, "step": 11764 }, { "epoch": 0.37352380952380954, "grad_norm": 0.29296875, "learning_rate": 0.1, "loss": 2.5110135078430176, "step": 11766 }, { "epoch": 0.3735873015873016, "grad_norm": 0.2255859375, "learning_rate": 0.1, "loss": 2.480912446975708, "step": 11768 }, { "epoch": 0.3736507936507937, "grad_norm": 0.10693359375, "learning_rate": 0.1, "loss": 2.505415916442871, "step": 11770 }, { "epoch": 0.3737142857142857, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.484025239944458, "step": 11772 }, { "epoch": 0.37377777777777776, "grad_norm": 0.11572265625, "learning_rate": 0.1, "loss": 2.478426933288574, "step": 11774 }, { "epoch": 0.37384126984126986, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.497925043106079, "step": 11776 }, { "epoch": 0.3739047619047619, "grad_norm": 0.11572265625, "learning_rate": 0.1, "loss": 2.4825384616851807, "step": 11778 }, { "epoch": 0.37396825396825395, "grad_norm": 0.2470703125, "learning_rate": 0.1, "loss": 2.4915850162506104, "step": 11780 }, { "epoch": 0.37403174603174605, "grad_norm": 0.06103515625, "learning_rate": 0.1, "loss": 2.4559059143066406, "step": 11782 }, { "epoch": 0.3740952380952381, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.5077600479125977, "step": 11784 }, { "epoch": 0.37415873015873014, "grad_norm": 0.4921875, "learning_rate": 0.1, "loss": 2.4866786003112793, "step": 11786 }, { "epoch": 0.37422222222222223, "grad_norm": 0.423828125, "learning_rate": 0.1, "loss": 2.4874205589294434, "step": 11788 }, { "epoch": 0.3742857142857143, "grad_norm": 0.0771484375, "learning_rate": 0.1, "loss": 2.462109088897705, "step": 11790 }, { "epoch": 0.3743492063492064, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.4757614135742188, "step": 11792 }, { "epoch": 0.3744126984126984, "grad_norm": 0.1181640625, "learning_rate": 0.1, "loss": 2.506653308868408, "step": 11794 }, { "epoch": 0.37447619047619046, "grad_norm": 0.11328125, "learning_rate": 0.1, "loss": 2.4672343730926514, "step": 11796 }, { "epoch": 0.37453968253968256, "grad_norm": 0.10400390625, "learning_rate": 0.1, "loss": 2.47956919670105, "step": 11798 }, { "epoch": 0.3746031746031746, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.4795355796813965, "step": 11800 }, { "epoch": 0.37466666666666665, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.47666597366333, "step": 11802 }, { "epoch": 0.37473015873015875, "grad_norm": 0.3984375, "learning_rate": 0.1, "loss": 2.47880220413208, "step": 11804 }, { "epoch": 0.3747936507936508, "grad_norm": 0.5078125, "learning_rate": 0.1, "loss": 2.4636754989624023, "step": 11806 }, { "epoch": 0.37485714285714283, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.4849722385406494, "step": 11808 }, { "epoch": 0.37492063492063493, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.4913809299468994, "step": 11810 }, { "epoch": 0.374984126984127, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.4815680980682373, "step": 11812 }, { "epoch": 0.3750476190476191, "grad_norm": 0.25, "learning_rate": 0.1, "loss": 2.4828529357910156, "step": 11814 }, { "epoch": 0.3751111111111111, "grad_norm": 0.3203125, "learning_rate": 0.1, "loss": 2.46449875831604, "step": 11816 }, { "epoch": 0.37517460317460316, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.490966558456421, "step": 11818 }, { "epoch": 0.37523809523809526, "grad_norm": 0.04345703125, "learning_rate": 0.1, "loss": 2.482635021209717, "step": 11820 }, { "epoch": 0.3753015873015873, "grad_norm": 0.057861328125, "learning_rate": 0.1, "loss": 2.484558582305908, "step": 11822 }, { "epoch": 0.37536507936507935, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.4617435932159424, "step": 11824 }, { "epoch": 0.37542857142857144, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.461313009262085, "step": 11826 }, { "epoch": 0.3754920634920635, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.452178716659546, "step": 11828 }, { "epoch": 0.37555555555555553, "grad_norm": 0.068359375, "learning_rate": 0.1, "loss": 2.481152057647705, "step": 11830 }, { "epoch": 0.37561904761904763, "grad_norm": 0.302734375, "learning_rate": 0.1, "loss": 2.4916257858276367, "step": 11832 }, { "epoch": 0.3756825396825397, "grad_norm": 0.58203125, "learning_rate": 0.1, "loss": 2.4957711696624756, "step": 11834 }, { "epoch": 0.3757460317460318, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.4785571098327637, "step": 11836 }, { "epoch": 0.3758095238095238, "grad_norm": 0.2177734375, "learning_rate": 0.1, "loss": 2.486116409301758, "step": 11838 }, { "epoch": 0.37587301587301586, "grad_norm": 0.203125, "learning_rate": 0.1, "loss": 2.504101514816284, "step": 11840 }, { "epoch": 0.37593650793650796, "grad_norm": 0.115234375, "learning_rate": 0.1, "loss": 2.473159074783325, "step": 11842 }, { "epoch": 0.376, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.475519895553589, "step": 11844 }, { "epoch": 0.37606349206349204, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.47436785697937, "step": 11846 }, { "epoch": 0.37612698412698414, "grad_norm": 0.310546875, "learning_rate": 0.1, "loss": 2.445840358734131, "step": 11848 }, { "epoch": 0.3761904761904762, "grad_norm": 0.244140625, "learning_rate": 0.1, "loss": 2.4474823474884033, "step": 11850 }, { "epoch": 0.37625396825396823, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.493243932723999, "step": 11852 }, { "epoch": 0.37631746031746033, "grad_norm": 0.10595703125, "learning_rate": 0.1, "loss": 2.4680328369140625, "step": 11854 }, { "epoch": 0.37638095238095237, "grad_norm": 0.142578125, "learning_rate": 0.1, "loss": 2.492051601409912, "step": 11856 }, { "epoch": 0.37644444444444447, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.478950023651123, "step": 11858 }, { "epoch": 0.3765079365079365, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.4861087799072266, "step": 11860 }, { "epoch": 0.37657142857142856, "grad_norm": 0.12255859375, "learning_rate": 0.1, "loss": 2.466583490371704, "step": 11862 }, { "epoch": 0.37663492063492066, "grad_norm": 0.103515625, "learning_rate": 0.1, "loss": 2.4566826820373535, "step": 11864 }, { "epoch": 0.3766984126984127, "grad_norm": 0.310546875, "learning_rate": 0.1, "loss": 2.459071159362793, "step": 11866 }, { "epoch": 0.37676190476190474, "grad_norm": 0.369140625, "learning_rate": 0.1, "loss": 2.474431037902832, "step": 11868 }, { "epoch": 0.37682539682539684, "grad_norm": 0.10205078125, "learning_rate": 0.1, "loss": 2.474824905395508, "step": 11870 }, { "epoch": 0.3768888888888889, "grad_norm": 0.1416015625, "learning_rate": 0.1, "loss": 2.5060482025146484, "step": 11872 }, { "epoch": 0.3769523809523809, "grad_norm": 0.2275390625, "learning_rate": 0.1, "loss": 2.4836771488189697, "step": 11874 }, { "epoch": 0.377015873015873, "grad_norm": 0.1650390625, "learning_rate": 0.1, "loss": 2.4806151390075684, "step": 11876 }, { "epoch": 0.37707936507936507, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.4879519939422607, "step": 11878 }, { "epoch": 0.37714285714285717, "grad_norm": 0.3046875, "learning_rate": 0.1, "loss": 2.4853639602661133, "step": 11880 }, { "epoch": 0.3772063492063492, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.4878056049346924, "step": 11882 }, { "epoch": 0.37726984126984126, "grad_norm": 0.1533203125, "learning_rate": 0.1, "loss": 2.4815573692321777, "step": 11884 }, { "epoch": 0.37733333333333335, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.4892303943634033, "step": 11886 }, { "epoch": 0.3773968253968254, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.485851526260376, "step": 11888 }, { "epoch": 0.37746031746031744, "grad_norm": 0.294921875, "learning_rate": 0.1, "loss": 2.45359468460083, "step": 11890 }, { "epoch": 0.37752380952380954, "grad_norm": 0.482421875, "learning_rate": 0.1, "loss": 2.4731178283691406, "step": 11892 }, { "epoch": 0.3775873015873016, "grad_norm": 0.33203125, "learning_rate": 0.1, "loss": 2.44585919380188, "step": 11894 }, { "epoch": 0.3776507936507936, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.453669786453247, "step": 11896 }, { "epoch": 0.3777142857142857, "grad_norm": 0.08251953125, "learning_rate": 0.1, "loss": 2.4264750480651855, "step": 11898 }, { "epoch": 0.37777777777777777, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.466355323791504, "step": 11900 }, { "epoch": 0.37784126984126987, "grad_norm": 0.208984375, "learning_rate": 0.1, "loss": 2.453779458999634, "step": 11902 }, { "epoch": 0.3779047619047619, "grad_norm": 0.10205078125, "learning_rate": 0.1, "loss": 2.4556055068969727, "step": 11904 }, { "epoch": 0.37796825396825395, "grad_norm": 0.375, "learning_rate": 0.1, "loss": 2.4727084636688232, "step": 11906 }, { "epoch": 0.37803174603174605, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.430933713912964, "step": 11908 }, { "epoch": 0.3780952380952381, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.452418327331543, "step": 11910 }, { "epoch": 0.37815873015873014, "grad_norm": 0.2431640625, "learning_rate": 0.1, "loss": 2.4696695804595947, "step": 11912 }, { "epoch": 0.37822222222222224, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.457073211669922, "step": 11914 }, { "epoch": 0.3782857142857143, "grad_norm": 0.0888671875, "learning_rate": 0.1, "loss": 2.4525954723358154, "step": 11916 }, { "epoch": 0.3783492063492063, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.467282772064209, "step": 11918 }, { "epoch": 0.3784126984126984, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.4599666595458984, "step": 11920 }, { "epoch": 0.37847619047619047, "grad_norm": 0.435546875, "learning_rate": 0.1, "loss": 2.4501702785491943, "step": 11922 }, { "epoch": 0.37853968253968256, "grad_norm": 0.447265625, "learning_rate": 0.1, "loss": 2.4939215183258057, "step": 11924 }, { "epoch": 0.3786031746031746, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.448594093322754, "step": 11926 }, { "epoch": 0.37866666666666665, "grad_norm": 0.09619140625, "learning_rate": 0.1, "loss": 2.4669272899627686, "step": 11928 }, { "epoch": 0.37873015873015875, "grad_norm": 0.2421875, "learning_rate": 0.1, "loss": 2.474453926086426, "step": 11930 }, { "epoch": 0.3787936507936508, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.4715001583099365, "step": 11932 }, { "epoch": 0.37885714285714284, "grad_norm": 0.365234375, "learning_rate": 0.1, "loss": 2.4608728885650635, "step": 11934 }, { "epoch": 0.37892063492063494, "grad_norm": 0.302734375, "learning_rate": 0.1, "loss": 2.483308792114258, "step": 11936 }, { "epoch": 0.378984126984127, "grad_norm": 0.10791015625, "learning_rate": 0.1, "loss": 2.484426975250244, "step": 11938 }, { "epoch": 0.379047619047619, "grad_norm": 0.0634765625, "learning_rate": 0.1, "loss": 2.478487968444824, "step": 11940 }, { "epoch": 0.3791111111111111, "grad_norm": 0.07177734375, "learning_rate": 0.1, "loss": 2.434561014175415, "step": 11942 }, { "epoch": 0.37917460317460316, "grad_norm": 0.057861328125, "learning_rate": 0.1, "loss": 2.4459564685821533, "step": 11944 }, { "epoch": 0.37923809523809526, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.4592580795288086, "step": 11946 }, { "epoch": 0.3793015873015873, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.459186315536499, "step": 11948 }, { "epoch": 0.37936507936507935, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.458428144454956, "step": 11950 }, { "epoch": 0.37942857142857145, "grad_norm": 0.201171875, "learning_rate": 0.1, "loss": 2.4365906715393066, "step": 11952 }, { "epoch": 0.3794920634920635, "grad_norm": 0.318359375, "learning_rate": 0.1, "loss": 2.4577033519744873, "step": 11954 }, { "epoch": 0.37955555555555553, "grad_norm": 0.166015625, "learning_rate": 0.1, "loss": 2.445676565170288, "step": 11956 }, { "epoch": 0.37961904761904763, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.4809117317199707, "step": 11958 }, { "epoch": 0.3796825396825397, "grad_norm": 0.359375, "learning_rate": 0.1, "loss": 2.446380615234375, "step": 11960 }, { "epoch": 0.3797460317460317, "grad_norm": 0.44921875, "learning_rate": 0.1, "loss": 2.4621827602386475, "step": 11962 }, { "epoch": 0.3798095238095238, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.434882402420044, "step": 11964 }, { "epoch": 0.37987301587301586, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.4461870193481445, "step": 11966 }, { "epoch": 0.37993650793650796, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.4736037254333496, "step": 11968 }, { "epoch": 0.38, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.4388091564178467, "step": 11970 }, { "epoch": 0.38006349206349205, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.4562020301818848, "step": 11972 }, { "epoch": 0.38012698412698415, "grad_norm": 0.17578125, "learning_rate": 0.1, "loss": 2.4419918060302734, "step": 11974 }, { "epoch": 0.3801904761904762, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.4348676204681396, "step": 11976 }, { "epoch": 0.38025396825396823, "grad_norm": 0.2177734375, "learning_rate": 0.1, "loss": 2.4611902236938477, "step": 11978 }, { "epoch": 0.38031746031746033, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.4563024044036865, "step": 11980 }, { "epoch": 0.3803809523809524, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.445188045501709, "step": 11982 }, { "epoch": 0.3804444444444444, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.474489212036133, "step": 11984 }, { "epoch": 0.3805079365079365, "grad_norm": 0.39453125, "learning_rate": 0.1, "loss": 2.4510064125061035, "step": 11986 }, { "epoch": 0.38057142857142856, "grad_norm": 0.236328125, "learning_rate": 0.1, "loss": 2.494896650314331, "step": 11988 }, { "epoch": 0.38063492063492066, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.458981513977051, "step": 11990 }, { "epoch": 0.3806984126984127, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.4623570442199707, "step": 11992 }, { "epoch": 0.38076190476190475, "grad_norm": 0.337890625, "learning_rate": 0.1, "loss": 2.4659810066223145, "step": 11994 }, { "epoch": 0.38082539682539684, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.465498447418213, "step": 11996 }, { "epoch": 0.3808888888888889, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.4689385890960693, "step": 11998 }, { "epoch": 0.38095238095238093, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.4477922916412354, "step": 12000 }, { "epoch": 0.38101587301587303, "grad_norm": 0.6484375, "learning_rate": 0.1, "loss": 2.4727680683135986, "step": 12002 }, { "epoch": 0.3810793650793651, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.4675381183624268, "step": 12004 }, { "epoch": 0.3811428571428571, "grad_norm": 0.060302734375, "learning_rate": 0.1, "loss": 2.453152894973755, "step": 12006 }, { "epoch": 0.3812063492063492, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.4358911514282227, "step": 12008 }, { "epoch": 0.38126984126984126, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.4744372367858887, "step": 12010 }, { "epoch": 0.38133333333333336, "grad_norm": 0.09912109375, "learning_rate": 0.1, "loss": 2.45621919631958, "step": 12012 }, { "epoch": 0.3813968253968254, "grad_norm": 0.2060546875, "learning_rate": 0.1, "loss": 2.468158006668091, "step": 12014 }, { "epoch": 0.38146031746031744, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.4647765159606934, "step": 12016 }, { "epoch": 0.38152380952380954, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.4902184009552, "step": 12018 }, { "epoch": 0.3815873015873016, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.4720568656921387, "step": 12020 }, { "epoch": 0.38165079365079363, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.4778892993927, "step": 12022 }, { "epoch": 0.38171428571428573, "grad_norm": 0.2197265625, "learning_rate": 0.1, "loss": 2.46598482131958, "step": 12024 }, { "epoch": 0.38177777777777777, "grad_norm": 0.302734375, "learning_rate": 0.1, "loss": 2.4907238483428955, "step": 12026 }, { "epoch": 0.3818412698412698, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.474137783050537, "step": 12028 }, { "epoch": 0.3819047619047619, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.4604177474975586, "step": 12030 }, { "epoch": 0.38196825396825396, "grad_norm": 0.25, "learning_rate": 0.1, "loss": 2.4689693450927734, "step": 12032 }, { "epoch": 0.38203174603174606, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.458057403564453, "step": 12034 }, { "epoch": 0.3820952380952381, "grad_norm": 0.1318359375, "learning_rate": 0.1, "loss": 2.489271879196167, "step": 12036 }, { "epoch": 0.38215873015873014, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.4576919078826904, "step": 12038 }, { "epoch": 0.38222222222222224, "grad_norm": 0.228515625, "learning_rate": 0.1, "loss": 2.463383674621582, "step": 12040 }, { "epoch": 0.3822857142857143, "grad_norm": 0.322265625, "learning_rate": 0.1, "loss": 2.4628543853759766, "step": 12042 }, { "epoch": 0.3823492063492063, "grad_norm": 0.291015625, "learning_rate": 0.1, "loss": 2.4454421997070312, "step": 12044 }, { "epoch": 0.3824126984126984, "grad_norm": 0.06005859375, "learning_rate": 0.1, "loss": 2.4660277366638184, "step": 12046 }, { "epoch": 0.38247619047619047, "grad_norm": 0.216796875, "learning_rate": 0.1, "loss": 2.484807252883911, "step": 12048 }, { "epoch": 0.3825396825396825, "grad_norm": 0.380859375, "learning_rate": 0.1, "loss": 2.485915184020996, "step": 12050 }, { "epoch": 0.3826031746031746, "grad_norm": 0.3046875, "learning_rate": 0.1, "loss": 2.4765751361846924, "step": 12052 }, { "epoch": 0.38266666666666665, "grad_norm": 0.0615234375, "learning_rate": 0.1, "loss": 2.45377516746521, "step": 12054 }, { "epoch": 0.38273015873015875, "grad_norm": 0.1005859375, "learning_rate": 0.1, "loss": 2.477993965148926, "step": 12056 }, { "epoch": 0.3827936507936508, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.476003646850586, "step": 12058 }, { "epoch": 0.38285714285714284, "grad_norm": 0.26171875, "learning_rate": 0.1, "loss": 2.4513702392578125, "step": 12060 }, { "epoch": 0.38292063492063494, "grad_norm": 0.2490234375, "learning_rate": 0.1, "loss": 2.481924295425415, "step": 12062 }, { "epoch": 0.382984126984127, "grad_norm": 0.2119140625, "learning_rate": 0.1, "loss": 2.4330499172210693, "step": 12064 }, { "epoch": 0.383047619047619, "grad_norm": 0.1015625, "learning_rate": 0.1, "loss": 2.4708168506622314, "step": 12066 }, { "epoch": 0.3831111111111111, "grad_norm": 0.07421875, "learning_rate": 0.1, "loss": 2.4653377532958984, "step": 12068 }, { "epoch": 0.38317460317460317, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.5031898021698, "step": 12070 }, { "epoch": 0.3832380952380952, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.475032329559326, "step": 12072 }, { "epoch": 0.3833015873015873, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.4548768997192383, "step": 12074 }, { "epoch": 0.38336507936507935, "grad_norm": 0.337890625, "learning_rate": 0.1, "loss": 2.457284688949585, "step": 12076 }, { "epoch": 0.38342857142857145, "grad_norm": 0.2265625, "learning_rate": 0.1, "loss": 2.4320826530456543, "step": 12078 }, { "epoch": 0.3834920634920635, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.460562229156494, "step": 12080 }, { "epoch": 0.38355555555555554, "grad_norm": 0.44140625, "learning_rate": 0.1, "loss": 2.4694409370422363, "step": 12082 }, { "epoch": 0.38361904761904764, "grad_norm": 0.21484375, "learning_rate": 0.1, "loss": 2.4640471935272217, "step": 12084 }, { "epoch": 0.3836825396825397, "grad_norm": 0.11181640625, "learning_rate": 0.1, "loss": 2.480008363723755, "step": 12086 }, { "epoch": 0.3837460317460317, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.4768412113189697, "step": 12088 }, { "epoch": 0.3838095238095238, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.4503283500671387, "step": 12090 }, { "epoch": 0.38387301587301587, "grad_norm": 0.474609375, "learning_rate": 0.1, "loss": 2.4610016345977783, "step": 12092 }, { "epoch": 0.3839365079365079, "grad_norm": 0.486328125, "learning_rate": 0.1, "loss": 2.4768073558807373, "step": 12094 }, { "epoch": 0.384, "grad_norm": 0.046142578125, "learning_rate": 0.1, "loss": 2.4615330696105957, "step": 12096 }, { "epoch": 0.38406349206349205, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.4912209510803223, "step": 12098 }, { "epoch": 0.38412698412698415, "grad_norm": 0.2109375, "learning_rate": 0.1, "loss": 2.4521989822387695, "step": 12100 }, { "epoch": 0.3841904761904762, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.466559886932373, "step": 12102 }, { "epoch": 0.38425396825396824, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.465717315673828, "step": 12104 }, { "epoch": 0.38431746031746034, "grad_norm": 0.1630859375, "learning_rate": 0.1, "loss": 2.475071907043457, "step": 12106 }, { "epoch": 0.3843809523809524, "grad_norm": 0.470703125, "learning_rate": 0.1, "loss": 2.5202531814575195, "step": 12108 }, { "epoch": 0.3844444444444444, "grad_norm": 0.470703125, "learning_rate": 0.1, "loss": 2.4715182781219482, "step": 12110 }, { "epoch": 0.3845079365079365, "grad_norm": 0.1162109375, "learning_rate": 0.1, "loss": 2.455341100692749, "step": 12112 }, { "epoch": 0.38457142857142856, "grad_norm": 0.10400390625, "learning_rate": 0.1, "loss": 2.4790189266204834, "step": 12114 }, { "epoch": 0.3846349206349206, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.47458553314209, "step": 12116 }, { "epoch": 0.3846984126984127, "grad_norm": 0.07763671875, "learning_rate": 0.1, "loss": 2.475902557373047, "step": 12118 }, { "epoch": 0.38476190476190475, "grad_norm": 0.12255859375, "learning_rate": 0.1, "loss": 2.498180389404297, "step": 12120 }, { "epoch": 0.38482539682539685, "grad_norm": 0.3828125, "learning_rate": 0.1, "loss": 2.507518768310547, "step": 12122 }, { "epoch": 0.3848888888888889, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.4781503677368164, "step": 12124 }, { "epoch": 0.38495238095238093, "grad_norm": 0.06396484375, "learning_rate": 0.1, "loss": 2.5078864097595215, "step": 12126 }, { "epoch": 0.38501587301587303, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.479459285736084, "step": 12128 }, { "epoch": 0.3850793650793651, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.5114645957946777, "step": 12130 }, { "epoch": 0.3851428571428571, "grad_norm": 0.29296875, "learning_rate": 0.1, "loss": 2.4905712604522705, "step": 12132 }, { "epoch": 0.3852063492063492, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.4662930965423584, "step": 12134 }, { "epoch": 0.38526984126984126, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.484269857406616, "step": 12136 }, { "epoch": 0.38533333333333336, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.501328229904175, "step": 12138 }, { "epoch": 0.3853968253968254, "grad_norm": 0.068359375, "learning_rate": 0.1, "loss": 2.4805939197540283, "step": 12140 }, { "epoch": 0.38546031746031745, "grad_norm": 0.2392578125, "learning_rate": 0.1, "loss": 2.4765126705169678, "step": 12142 }, { "epoch": 0.38552380952380955, "grad_norm": 0.365234375, "learning_rate": 0.1, "loss": 2.500314474105835, "step": 12144 }, { "epoch": 0.3855873015873016, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.5027472972869873, "step": 12146 }, { "epoch": 0.38565079365079363, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.473562240600586, "step": 12148 }, { "epoch": 0.38571428571428573, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.511924982070923, "step": 12150 }, { "epoch": 0.3857777777777778, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.4819207191467285, "step": 12152 }, { "epoch": 0.3858412698412698, "grad_norm": 0.06591796875, "learning_rate": 0.1, "loss": 2.4831345081329346, "step": 12154 }, { "epoch": 0.3859047619047619, "grad_norm": 0.0849609375, "learning_rate": 0.1, "loss": 2.4863007068634033, "step": 12156 }, { "epoch": 0.38596825396825396, "grad_norm": 0.30859375, "learning_rate": 0.1, "loss": 2.460641622543335, "step": 12158 }, { "epoch": 0.38603174603174606, "grad_norm": 0.392578125, "learning_rate": 0.1, "loss": 2.4990243911743164, "step": 12160 }, { "epoch": 0.3860952380952381, "grad_norm": 0.1455078125, "learning_rate": 0.1, "loss": 2.5030386447906494, "step": 12162 }, { "epoch": 0.38615873015873015, "grad_norm": 0.248046875, "learning_rate": 0.1, "loss": 2.4420783519744873, "step": 12164 }, { "epoch": 0.38622222222222224, "grad_norm": 0.341796875, "learning_rate": 0.1, "loss": 2.504606008529663, "step": 12166 }, { "epoch": 0.3862857142857143, "grad_norm": 0.06640625, "learning_rate": 0.1, "loss": 2.5177011489868164, "step": 12168 }, { "epoch": 0.38634920634920633, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.5138041973114014, "step": 12170 }, { "epoch": 0.38641269841269843, "grad_norm": 0.11181640625, "learning_rate": 0.1, "loss": 2.4538416862487793, "step": 12172 }, { "epoch": 0.3864761904761905, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.4983925819396973, "step": 12174 }, { "epoch": 0.3865396825396825, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.472153425216675, "step": 12176 }, { "epoch": 0.3866031746031746, "grad_norm": 0.08544921875, "learning_rate": 0.1, "loss": 2.4816946983337402, "step": 12178 }, { "epoch": 0.38666666666666666, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.5129659175872803, "step": 12180 }, { "epoch": 0.38673015873015876, "grad_norm": 0.11376953125, "learning_rate": 0.1, "loss": 2.4627156257629395, "step": 12182 }, { "epoch": 0.3867936507936508, "grad_norm": 0.3984375, "learning_rate": 0.1, "loss": 2.494533061981201, "step": 12184 }, { "epoch": 0.38685714285714284, "grad_norm": 0.3515625, "learning_rate": 0.1, "loss": 2.471395254135132, "step": 12186 }, { "epoch": 0.38692063492063494, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.4891700744628906, "step": 12188 }, { "epoch": 0.386984126984127, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.5044522285461426, "step": 12190 }, { "epoch": 0.38704761904761903, "grad_norm": 0.2890625, "learning_rate": 0.1, "loss": 2.487046241760254, "step": 12192 }, { "epoch": 0.38711111111111113, "grad_norm": 0.1708984375, "learning_rate": 0.1, "loss": 2.514503002166748, "step": 12194 }, { "epoch": 0.38717460317460317, "grad_norm": 0.232421875, "learning_rate": 0.1, "loss": 2.4909095764160156, "step": 12196 }, { "epoch": 0.3872380952380952, "grad_norm": 0.224609375, "learning_rate": 0.1, "loss": 2.4744057655334473, "step": 12198 }, { "epoch": 0.3873015873015873, "grad_norm": 0.1103515625, "learning_rate": 0.1, "loss": 2.4896419048309326, "step": 12200 }, { "epoch": 0.38736507936507936, "grad_norm": 0.119140625, "learning_rate": 0.1, "loss": 2.4401516914367676, "step": 12202 }, { "epoch": 0.38742857142857146, "grad_norm": 0.103515625, "learning_rate": 0.1, "loss": 2.45546293258667, "step": 12204 }, { "epoch": 0.3874920634920635, "grad_norm": 0.203125, "learning_rate": 0.1, "loss": 2.4839749336242676, "step": 12206 }, { "epoch": 0.38755555555555554, "grad_norm": 0.09375, "learning_rate": 0.1, "loss": 2.457061767578125, "step": 12208 }, { "epoch": 0.38761904761904764, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.4803502559661865, "step": 12210 }, { "epoch": 0.3876825396825397, "grad_norm": 0.10791015625, "learning_rate": 0.1, "loss": 2.4752790927886963, "step": 12212 }, { "epoch": 0.3877460317460317, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.475409746170044, "step": 12214 }, { "epoch": 0.3878095238095238, "grad_norm": 0.369140625, "learning_rate": 0.1, "loss": 2.4633171558380127, "step": 12216 }, { "epoch": 0.38787301587301587, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.451033115386963, "step": 12218 }, { "epoch": 0.3879365079365079, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.4517745971679688, "step": 12220 }, { "epoch": 0.388, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.4649953842163086, "step": 12222 }, { "epoch": 0.38806349206349205, "grad_norm": 0.11376953125, "learning_rate": 0.1, "loss": 2.4534993171691895, "step": 12224 }, { "epoch": 0.38812698412698415, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.4374120235443115, "step": 12226 }, { "epoch": 0.3881904761904762, "grad_norm": 0.1123046875, "learning_rate": 0.1, "loss": 2.444390296936035, "step": 12228 }, { "epoch": 0.38825396825396824, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.4295685291290283, "step": 12230 }, { "epoch": 0.38831746031746034, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.4740664958953857, "step": 12232 }, { "epoch": 0.3883809523809524, "grad_norm": 0.150390625, "learning_rate": 0.1, "loss": 2.4552464485168457, "step": 12234 }, { "epoch": 0.3884444444444444, "grad_norm": 0.248046875, "learning_rate": 0.1, "loss": 2.469726085662842, "step": 12236 }, { "epoch": 0.3885079365079365, "grad_norm": 0.423828125, "learning_rate": 0.1, "loss": 2.4829745292663574, "step": 12238 }, { "epoch": 0.38857142857142857, "grad_norm": 0.337890625, "learning_rate": 0.1, "loss": 2.461160182952881, "step": 12240 }, { "epoch": 0.3886349206349206, "grad_norm": 0.10888671875, "learning_rate": 0.1, "loss": 2.465991973876953, "step": 12242 }, { "epoch": 0.3886984126984127, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.464043617248535, "step": 12244 }, { "epoch": 0.38876190476190475, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.4663949012756348, "step": 12246 }, { "epoch": 0.38882539682539685, "grad_norm": 0.15625, "learning_rate": 0.1, "loss": 2.4549779891967773, "step": 12248 }, { "epoch": 0.3888888888888889, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.4570398330688477, "step": 12250 }, { "epoch": 0.38895238095238094, "grad_norm": 0.10400390625, "learning_rate": 0.1, "loss": 2.4533965587615967, "step": 12252 }, { "epoch": 0.38901587301587304, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.461820363998413, "step": 12254 }, { "epoch": 0.3890793650793651, "grad_norm": 0.380859375, "learning_rate": 0.1, "loss": 2.473161220550537, "step": 12256 }, { "epoch": 0.3891428571428571, "grad_norm": 0.5859375, "learning_rate": 0.1, "loss": 2.4561400413513184, "step": 12258 }, { "epoch": 0.3892063492063492, "grad_norm": 0.1279296875, "learning_rate": 0.1, "loss": 2.4945881366729736, "step": 12260 }, { "epoch": 0.38926984126984127, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.4576754570007324, "step": 12262 }, { "epoch": 0.3893333333333333, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.4777143001556396, "step": 12264 }, { "epoch": 0.3893968253968254, "grad_norm": 0.25390625, "learning_rate": 0.1, "loss": 2.466712713241577, "step": 12266 }, { "epoch": 0.38946031746031745, "grad_norm": 0.3359375, "learning_rate": 0.1, "loss": 2.4671568870544434, "step": 12268 }, { "epoch": 0.38952380952380955, "grad_norm": 0.2138671875, "learning_rate": 0.1, "loss": 2.449784517288208, "step": 12270 }, { "epoch": 0.3895873015873016, "grad_norm": 0.09130859375, "learning_rate": 0.1, "loss": 2.468003273010254, "step": 12272 }, { "epoch": 0.38965079365079364, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.455387830734253, "step": 12274 }, { "epoch": 0.38971428571428574, "grad_norm": 0.052490234375, "learning_rate": 0.1, "loss": 2.445190191268921, "step": 12276 }, { "epoch": 0.3897777777777778, "grad_norm": 0.08056640625, "learning_rate": 0.1, "loss": 2.44834041595459, "step": 12278 }, { "epoch": 0.3898412698412698, "grad_norm": 0.25, "learning_rate": 0.1, "loss": 2.4347023963928223, "step": 12280 }, { "epoch": 0.3899047619047619, "grad_norm": 0.296875, "learning_rate": 0.1, "loss": 2.424799680709839, "step": 12282 }, { "epoch": 0.38996825396825396, "grad_norm": 0.2060546875, "learning_rate": 0.1, "loss": 2.452523708343506, "step": 12284 }, { "epoch": 0.390031746031746, "grad_norm": 0.109375, "learning_rate": 0.1, "loss": 2.4557151794433594, "step": 12286 }, { "epoch": 0.3900952380952381, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.424776315689087, "step": 12288 }, { "epoch": 0.39015873015873015, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.4696648120880127, "step": 12290 }, { "epoch": 0.39022222222222225, "grad_norm": 0.1064453125, "learning_rate": 0.1, "loss": 2.4684576988220215, "step": 12292 }, { "epoch": 0.3902857142857143, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.461087942123413, "step": 12294 }, { "epoch": 0.39034920634920633, "grad_norm": 0.203125, "learning_rate": 0.1, "loss": 2.430222749710083, "step": 12296 }, { "epoch": 0.39041269841269843, "grad_norm": 0.35546875, "learning_rate": 0.1, "loss": 2.4839582443237305, "step": 12298 }, { "epoch": 0.3904761904761905, "grad_norm": 0.1611328125, "learning_rate": 0.1, "loss": 2.4276487827301025, "step": 12300 }, { "epoch": 0.3905396825396825, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.4334208965301514, "step": 12302 }, { "epoch": 0.3906031746031746, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.4657487869262695, "step": 12304 }, { "epoch": 0.39066666666666666, "grad_norm": 0.287109375, "learning_rate": 0.1, "loss": 2.4697165489196777, "step": 12306 }, { "epoch": 0.3907301587301587, "grad_norm": 0.263671875, "learning_rate": 0.1, "loss": 2.4890670776367188, "step": 12308 }, { "epoch": 0.3907936507936508, "grad_norm": 0.078125, "learning_rate": 0.1, "loss": 2.482919931411743, "step": 12310 }, { "epoch": 0.39085714285714285, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.482670545578003, "step": 12312 }, { "epoch": 0.39092063492063495, "grad_norm": 0.4765625, "learning_rate": 0.1, "loss": 2.473506212234497, "step": 12314 }, { "epoch": 0.390984126984127, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.484362840652466, "step": 12316 }, { "epoch": 0.39104761904761903, "grad_norm": 0.09375, "learning_rate": 0.1, "loss": 2.49501895904541, "step": 12318 }, { "epoch": 0.39111111111111113, "grad_norm": 0.1044921875, "learning_rate": 0.1, "loss": 2.473717451095581, "step": 12320 }, { "epoch": 0.3911746031746032, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.4876880645751953, "step": 12322 }, { "epoch": 0.3912380952380952, "grad_norm": 0.220703125, "learning_rate": 0.1, "loss": 2.4805827140808105, "step": 12324 }, { "epoch": 0.3913015873015873, "grad_norm": 0.31640625, "learning_rate": 0.1, "loss": 2.508021593093872, "step": 12326 }, { "epoch": 0.39136507936507936, "grad_norm": 0.33984375, "learning_rate": 0.1, "loss": 2.4558026790618896, "step": 12328 }, { "epoch": 0.3914285714285714, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.4671530723571777, "step": 12330 }, { "epoch": 0.3914920634920635, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.4863462448120117, "step": 12332 }, { "epoch": 0.39155555555555555, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.462531328201294, "step": 12334 }, { "epoch": 0.39161904761904764, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.4865562915802, "step": 12336 }, { "epoch": 0.3916825396825397, "grad_norm": 0.12353515625, "learning_rate": 0.1, "loss": 2.481738805770874, "step": 12338 }, { "epoch": 0.39174603174603173, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.490565061569214, "step": 12340 }, { "epoch": 0.39180952380952383, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.481773614883423, "step": 12342 }, { "epoch": 0.3918730158730159, "grad_norm": 0.50390625, "learning_rate": 0.1, "loss": 2.4730658531188965, "step": 12344 }, { "epoch": 0.3919365079365079, "grad_norm": 0.298828125, "learning_rate": 0.1, "loss": 2.4841883182525635, "step": 12346 }, { "epoch": 0.392, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.493680715560913, "step": 12348 }, { "epoch": 0.39206349206349206, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.502474784851074, "step": 12350 }, { "epoch": 0.3921269841269841, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.4785163402557373, "step": 12352 }, { "epoch": 0.3921904761904762, "grad_norm": 0.4296875, "learning_rate": 0.1, "loss": 2.491947650909424, "step": 12354 }, { "epoch": 0.39225396825396824, "grad_norm": 0.1328125, "learning_rate": 0.1, "loss": 2.5014781951904297, "step": 12356 }, { "epoch": 0.39231746031746034, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.479877471923828, "step": 12358 }, { "epoch": 0.3923809523809524, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.460399627685547, "step": 12360 }, { "epoch": 0.39244444444444443, "grad_norm": 0.236328125, "learning_rate": 0.1, "loss": 2.4771628379821777, "step": 12362 }, { "epoch": 0.39250793650793653, "grad_norm": 0.09619140625, "learning_rate": 0.1, "loss": 2.5179996490478516, "step": 12364 }, { "epoch": 0.39257142857142857, "grad_norm": 0.158203125, "learning_rate": 0.1, "loss": 2.4573960304260254, "step": 12366 }, { "epoch": 0.3926349206349206, "grad_norm": 0.07666015625, "learning_rate": 0.1, "loss": 2.488504409790039, "step": 12368 }, { "epoch": 0.3926984126984127, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.48992657661438, "step": 12370 }, { "epoch": 0.39276190476190476, "grad_norm": 0.443359375, "learning_rate": 0.1, "loss": 2.4311540126800537, "step": 12372 }, { "epoch": 0.3928253968253968, "grad_norm": 0.0859375, "learning_rate": 0.1, "loss": 2.4825241565704346, "step": 12374 }, { "epoch": 0.3928888888888889, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.4729347229003906, "step": 12376 }, { "epoch": 0.39295238095238094, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.4759974479675293, "step": 12378 }, { "epoch": 0.39301587301587304, "grad_norm": 0.05712890625, "learning_rate": 0.1, "loss": 2.490987539291382, "step": 12380 }, { "epoch": 0.3930793650793651, "grad_norm": 0.0595703125, "learning_rate": 0.1, "loss": 2.4801199436187744, "step": 12382 }, { "epoch": 0.3931428571428571, "grad_norm": 0.0966796875, "learning_rate": 0.1, "loss": 2.4446208477020264, "step": 12384 }, { "epoch": 0.3932063492063492, "grad_norm": 0.30078125, "learning_rate": 0.1, "loss": 2.462266683578491, "step": 12386 }, { "epoch": 0.39326984126984127, "grad_norm": 0.55859375, "learning_rate": 0.1, "loss": 2.4918887615203857, "step": 12388 }, { "epoch": 0.3933333333333333, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.4611899852752686, "step": 12390 }, { "epoch": 0.3933968253968254, "grad_norm": 0.1796875, "learning_rate": 0.1, "loss": 2.4827232360839844, "step": 12392 }, { "epoch": 0.39346031746031745, "grad_norm": 0.1640625, "learning_rate": 0.1, "loss": 2.4664206504821777, "step": 12394 }, { "epoch": 0.3935238095238095, "grad_norm": 0.09375, "learning_rate": 0.1, "loss": 2.463472843170166, "step": 12396 }, { "epoch": 0.3935873015873016, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.452786445617676, "step": 12398 }, { "epoch": 0.39365079365079364, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.477893590927124, "step": 12400 }, { "epoch": 0.39371428571428574, "grad_norm": 0.3359375, "learning_rate": 0.1, "loss": 2.4455089569091797, "step": 12402 }, { "epoch": 0.3937777777777778, "grad_norm": 0.53515625, "learning_rate": 0.1, "loss": 2.4739983081817627, "step": 12404 }, { "epoch": 0.3938412698412698, "grad_norm": 0.2197265625, "learning_rate": 0.1, "loss": 2.463010549545288, "step": 12406 }, { "epoch": 0.3939047619047619, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.485477924346924, "step": 12408 }, { "epoch": 0.39396825396825397, "grad_norm": 0.11376953125, "learning_rate": 0.1, "loss": 2.4666271209716797, "step": 12410 }, { "epoch": 0.394031746031746, "grad_norm": 0.11962890625, "learning_rate": 0.1, "loss": 2.4628119468688965, "step": 12412 }, { "epoch": 0.3940952380952381, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.4641313552856445, "step": 12414 }, { "epoch": 0.39415873015873015, "grad_norm": 0.37890625, "learning_rate": 0.1, "loss": 2.4827420711517334, "step": 12416 }, { "epoch": 0.3942222222222222, "grad_norm": 0.26953125, "learning_rate": 0.1, "loss": 2.471240520477295, "step": 12418 }, { "epoch": 0.3942857142857143, "grad_norm": 0.10791015625, "learning_rate": 0.1, "loss": 2.4685475826263428, "step": 12420 }, { "epoch": 0.39434920634920634, "grad_norm": 0.12158203125, "learning_rate": 0.1, "loss": 2.4665637016296387, "step": 12422 }, { "epoch": 0.39441269841269844, "grad_norm": 0.2177734375, "learning_rate": 0.1, "loss": 2.4597582817077637, "step": 12424 }, { "epoch": 0.3944761904761905, "grad_norm": 0.06982421875, "learning_rate": 0.1, "loss": 2.4702558517456055, "step": 12426 }, { "epoch": 0.3945396825396825, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.458078145980835, "step": 12428 }, { "epoch": 0.3946031746031746, "grad_norm": 0.07568359375, "learning_rate": 0.1, "loss": 2.460068702697754, "step": 12430 }, { "epoch": 0.39466666666666667, "grad_norm": 0.283203125, "learning_rate": 0.1, "loss": 2.4554131031036377, "step": 12432 }, { "epoch": 0.3947301587301587, "grad_norm": 0.08056640625, "learning_rate": 0.1, "loss": 2.488631248474121, "step": 12434 }, { "epoch": 0.3947936507936508, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.4419941902160645, "step": 12436 }, { "epoch": 0.39485714285714285, "grad_norm": 0.25, "learning_rate": 0.1, "loss": 2.4841761589050293, "step": 12438 }, { "epoch": 0.3949206349206349, "grad_norm": 0.26171875, "learning_rate": 0.1, "loss": 2.4492082595825195, "step": 12440 }, { "epoch": 0.394984126984127, "grad_norm": 0.26953125, "learning_rate": 0.1, "loss": 2.447962760925293, "step": 12442 }, { "epoch": 0.39504761904761904, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.4587724208831787, "step": 12444 }, { "epoch": 0.39511111111111114, "grad_norm": 0.0712890625, "learning_rate": 0.1, "loss": 2.462385892868042, "step": 12446 }, { "epoch": 0.3951746031746032, "grad_norm": 0.057373046875, "learning_rate": 0.1, "loss": 2.4769206047058105, "step": 12448 }, { "epoch": 0.3952380952380952, "grad_norm": 0.26953125, "learning_rate": 0.1, "loss": 2.448030948638916, "step": 12450 }, { "epoch": 0.3953015873015873, "grad_norm": 0.5, "learning_rate": 0.1, "loss": 2.474348783493042, "step": 12452 }, { "epoch": 0.39536507936507936, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.4807779788970947, "step": 12454 }, { "epoch": 0.3954285714285714, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.4678142070770264, "step": 12456 }, { "epoch": 0.3954920634920635, "grad_norm": 0.1591796875, "learning_rate": 0.1, "loss": 2.4569032192230225, "step": 12458 }, { "epoch": 0.39555555555555555, "grad_norm": 0.224609375, "learning_rate": 0.1, "loss": 2.476902484893799, "step": 12460 }, { "epoch": 0.3956190476190476, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.4690651893615723, "step": 12462 }, { "epoch": 0.3956825396825397, "grad_norm": 0.2451171875, "learning_rate": 0.1, "loss": 2.4898293018341064, "step": 12464 }, { "epoch": 0.39574603174603173, "grad_norm": 0.375, "learning_rate": 0.1, "loss": 2.4907736778259277, "step": 12466 }, { "epoch": 0.39580952380952383, "grad_norm": 0.130859375, "learning_rate": 0.1, "loss": 2.4488649368286133, "step": 12468 }, { "epoch": 0.3958730158730159, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.454991102218628, "step": 12470 }, { "epoch": 0.3959365079365079, "grad_norm": 0.07275390625, "learning_rate": 0.1, "loss": 2.489849328994751, "step": 12472 }, { "epoch": 0.396, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.4257235527038574, "step": 12474 }, { "epoch": 0.39606349206349206, "grad_norm": 0.056396484375, "learning_rate": 0.1, "loss": 2.471559762954712, "step": 12476 }, { "epoch": 0.3961269841269841, "grad_norm": 0.2060546875, "learning_rate": 0.1, "loss": 2.4803552627563477, "step": 12478 }, { "epoch": 0.3961904761904762, "grad_norm": 0.451171875, "learning_rate": 0.1, "loss": 2.445671319961548, "step": 12480 }, { "epoch": 0.39625396825396825, "grad_norm": 0.1923828125, "learning_rate": 0.1, "loss": 2.468325138092041, "step": 12482 }, { "epoch": 0.3963174603174603, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.4626095294952393, "step": 12484 }, { "epoch": 0.3963809523809524, "grad_norm": 0.0625, "learning_rate": 0.1, "loss": 2.4606876373291016, "step": 12486 }, { "epoch": 0.39644444444444443, "grad_norm": 0.333984375, "learning_rate": 0.1, "loss": 2.4665181636810303, "step": 12488 }, { "epoch": 0.39650793650793653, "grad_norm": 0.50390625, "learning_rate": 0.1, "loss": 2.468353748321533, "step": 12490 }, { "epoch": 0.3965714285714286, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.486236095428467, "step": 12492 }, { "epoch": 0.3966349206349206, "grad_norm": 0.0869140625, "learning_rate": 0.1, "loss": 2.473392963409424, "step": 12494 }, { "epoch": 0.3966984126984127, "grad_norm": 0.10693359375, "learning_rate": 0.1, "loss": 2.485105514526367, "step": 12496 }, { "epoch": 0.39676190476190476, "grad_norm": 0.1826171875, "learning_rate": 0.1, "loss": 2.4699769020080566, "step": 12498 }, { "epoch": 0.3968253968253968, "grad_norm": 0.1806640625, "learning_rate": 0.1, "loss": 2.4763691425323486, "step": 12500 }, { "epoch": 0.3968888888888889, "grad_norm": 0.07568359375, "learning_rate": 0.1, "loss": 2.491389274597168, "step": 12502 }, { "epoch": 0.39695238095238095, "grad_norm": 0.1025390625, "learning_rate": 0.1, "loss": 2.481797218322754, "step": 12504 }, { "epoch": 0.397015873015873, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.5158865451812744, "step": 12506 }, { "epoch": 0.3970793650793651, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.4983389377593994, "step": 12508 }, { "epoch": 0.39714285714285713, "grad_norm": 0.1201171875, "learning_rate": 0.1, "loss": 2.496899127960205, "step": 12510 }, { "epoch": 0.39720634920634923, "grad_norm": 0.087890625, "learning_rate": 0.1, "loss": 2.499006986618042, "step": 12512 }, { "epoch": 0.3972698412698413, "grad_norm": 0.11865234375, "learning_rate": 0.1, "loss": 2.483018636703491, "step": 12514 }, { "epoch": 0.3973333333333333, "grad_norm": 0.384765625, "learning_rate": 0.1, "loss": 2.4755356311798096, "step": 12516 }, { "epoch": 0.3973968253968254, "grad_norm": 0.482421875, "learning_rate": 0.1, "loss": 2.4969849586486816, "step": 12518 }, { "epoch": 0.39746031746031746, "grad_norm": 0.06591796875, "learning_rate": 0.1, "loss": 2.481879234313965, "step": 12520 }, { "epoch": 0.3975238095238095, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.507845401763916, "step": 12522 }, { "epoch": 0.3975873015873016, "grad_norm": 0.2080078125, "learning_rate": 0.1, "loss": 2.514796018600464, "step": 12524 }, { "epoch": 0.39765079365079364, "grad_norm": 0.255859375, "learning_rate": 0.1, "loss": 2.486309766769409, "step": 12526 }, { "epoch": 0.3977142857142857, "grad_norm": 0.2412109375, "learning_rate": 0.1, "loss": 2.4785990715026855, "step": 12528 }, { "epoch": 0.3977777777777778, "grad_norm": 0.294921875, "learning_rate": 0.1, "loss": 2.4831855297088623, "step": 12530 }, { "epoch": 0.39784126984126983, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.4929800033569336, "step": 12532 }, { "epoch": 0.3979047619047619, "grad_norm": 0.2158203125, "learning_rate": 0.1, "loss": 2.5137693881988525, "step": 12534 }, { "epoch": 0.39796825396825397, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.4880499839782715, "step": 12536 }, { "epoch": 0.398031746031746, "grad_norm": 0.423828125, "learning_rate": 0.1, "loss": 2.503533124923706, "step": 12538 }, { "epoch": 0.3980952380952381, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.480361223220825, "step": 12540 }, { "epoch": 0.39815873015873016, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.4782419204711914, "step": 12542 }, { "epoch": 0.3982222222222222, "grad_norm": 0.1572265625, "learning_rate": 0.1, "loss": 2.463319778442383, "step": 12544 }, { "epoch": 0.3982857142857143, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.4680655002593994, "step": 12546 }, { "epoch": 0.39834920634920634, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.4729971885681152, "step": 12548 }, { "epoch": 0.3984126984126984, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.4643752574920654, "step": 12550 }, { "epoch": 0.3984761904761905, "grad_norm": 0.1650390625, "learning_rate": 0.1, "loss": 2.4500062465667725, "step": 12552 }, { "epoch": 0.3985396825396825, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.4672186374664307, "step": 12554 }, { "epoch": 0.3986031746031746, "grad_norm": 0.38671875, "learning_rate": 0.1, "loss": 2.4818828105926514, "step": 12556 }, { "epoch": 0.39866666666666667, "grad_norm": 0.2578125, "learning_rate": 0.1, "loss": 2.4652960300445557, "step": 12558 }, { "epoch": 0.3987301587301587, "grad_norm": 0.11474609375, "learning_rate": 0.1, "loss": 2.433884382247925, "step": 12560 }, { "epoch": 0.3987936507936508, "grad_norm": 0.05712890625, "learning_rate": 0.1, "loss": 2.424755811691284, "step": 12562 }, { "epoch": 0.39885714285714285, "grad_norm": 0.07568359375, "learning_rate": 0.1, "loss": 2.430487632751465, "step": 12564 }, { "epoch": 0.3989206349206349, "grad_norm": 0.055908203125, "learning_rate": 0.1, "loss": 2.4455268383026123, "step": 12566 }, { "epoch": 0.398984126984127, "grad_norm": 0.23828125, "learning_rate": 0.1, "loss": 2.4371044635772705, "step": 12568 }, { "epoch": 0.39904761904761904, "grad_norm": 0.546875, "learning_rate": 0.1, "loss": 2.4374101161956787, "step": 12570 }, { "epoch": 0.39911111111111114, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.4382810592651367, "step": 12572 }, { "epoch": 0.3991746031746032, "grad_norm": 0.056884765625, "learning_rate": 0.1, "loss": 2.4031074047088623, "step": 12574 }, { "epoch": 0.3992380952380952, "grad_norm": 0.0517578125, "learning_rate": 0.1, "loss": 2.41848087310791, "step": 12576 }, { "epoch": 0.3993015873015873, "grad_norm": 0.12890625, "learning_rate": 0.1, "loss": 2.432514190673828, "step": 12578 }, { "epoch": 0.39936507936507937, "grad_norm": 0.4296875, "learning_rate": 0.1, "loss": 2.4152355194091797, "step": 12580 }, { "epoch": 0.3994285714285714, "grad_norm": 0.326171875, "learning_rate": 0.1, "loss": 2.3761203289031982, "step": 12582 }, { "epoch": 0.3994920634920635, "grad_norm": 0.06103515625, "learning_rate": 0.1, "loss": 2.4018025398254395, "step": 12584 }, { "epoch": 0.39955555555555555, "grad_norm": 0.07080078125, "learning_rate": 0.1, "loss": 2.386880874633789, "step": 12586 }, { "epoch": 0.3996190476190476, "grad_norm": 0.1845703125, "learning_rate": 0.1, "loss": 2.397996425628662, "step": 12588 }, { "epoch": 0.3996825396825397, "grad_norm": 0.341796875, "learning_rate": 0.1, "loss": 2.398606300354004, "step": 12590 }, { "epoch": 0.39974603174603174, "grad_norm": 0.0771484375, "learning_rate": 0.1, "loss": 2.370978832244873, "step": 12592 }, { "epoch": 0.39980952380952384, "grad_norm": 0.10205078125, "learning_rate": 0.1, "loss": 2.37992262840271, "step": 12594 }, { "epoch": 0.3998730158730159, "grad_norm": 0.1435546875, "learning_rate": 0.1, "loss": 2.378612995147705, "step": 12596 }, { "epoch": 0.3999365079365079, "grad_norm": 0.12060546875, "learning_rate": 0.1, "loss": 2.3547215461730957, "step": 12598 }, { "epoch": 0.4, "grad_norm": 0.08740234375, "learning_rate": 0.1, "loss": 2.359011650085449, "step": 12600 }, { "epoch": 0.4, "eval_loss": 1.7705790996551514, "eval_runtime": 105.9037, "eval_samples_per_second": 10.028, "eval_steps_per_second": 2.512, "step": 12600 }, { "epoch": 0.40006349206349207, "grad_norm": 0.2060546875, "learning_rate": 0.1, "loss": 2.377570629119873, "step": 12602 }, { "epoch": 0.4001269841269841, "grad_norm": 0.0693359375, "learning_rate": 0.1, "loss": 2.394019365310669, "step": 12604 }, { "epoch": 0.4001904761904762, "grad_norm": 0.1953125, "learning_rate": 0.1, "loss": 2.3574202060699463, "step": 12606 }, { "epoch": 0.40025396825396825, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.33833646774292, "step": 12608 }, { "epoch": 0.4003174603174603, "grad_norm": 0.33203125, "learning_rate": 0.1, "loss": 2.369246244430542, "step": 12610 }, { "epoch": 0.4003809523809524, "grad_norm": 0.181640625, "learning_rate": 0.1, "loss": 2.3324646949768066, "step": 12612 }, { "epoch": 0.40044444444444444, "grad_norm": 0.2021484375, "learning_rate": 0.1, "loss": 2.342609405517578, "step": 12614 }, { "epoch": 0.40050793650793653, "grad_norm": 0.208984375, "learning_rate": 0.1, "loss": 2.3590927124023438, "step": 12616 }, { "epoch": 0.4005714285714286, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.340346097946167, "step": 12618 }, { "epoch": 0.4006349206349206, "grad_norm": 0.34765625, "learning_rate": 0.1, "loss": 2.324613571166992, "step": 12620 }, { "epoch": 0.4006984126984127, "grad_norm": 0.09521484375, "learning_rate": 0.1, "loss": 2.3287599086761475, "step": 12622 }, { "epoch": 0.40076190476190476, "grad_norm": 0.1884765625, "learning_rate": 0.1, "loss": 2.3095972537994385, "step": 12624 }, { "epoch": 0.4008253968253968, "grad_norm": 0.06591796875, "learning_rate": 0.1, "loss": 2.3337745666503906, "step": 12626 }, { "epoch": 0.4008888888888889, "grad_norm": 0.0947265625, "learning_rate": 0.1, "loss": 2.3558530807495117, "step": 12628 }, { "epoch": 0.40095238095238095, "grad_norm": 0.1474609375, "learning_rate": 0.1, "loss": 2.3162124156951904, "step": 12630 }, { "epoch": 0.401015873015873, "grad_norm": 0.20703125, "learning_rate": 0.1, "loss": 2.35158634185791, "step": 12632 }, { "epoch": 0.4010793650793651, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.3345530033111572, "step": 12634 }, { "epoch": 0.40114285714285713, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.3051955699920654, "step": 12636 }, { "epoch": 0.40120634920634923, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.3525850772857666, "step": 12638 }, { "epoch": 0.4012698412698413, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.3211631774902344, "step": 12640 }, { "epoch": 0.4013333333333333, "grad_norm": 0.193359375, "learning_rate": 0.1, "loss": 2.300722122192383, "step": 12642 }, { "epoch": 0.4013968253968254, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.308464288711548, "step": 12644 }, { "epoch": 0.40146031746031746, "grad_norm": 0.265625, "learning_rate": 0.1, "loss": 2.2925209999084473, "step": 12646 }, { "epoch": 0.4015238095238095, "grad_norm": 0.29296875, "learning_rate": 0.1, "loss": 2.31681227684021, "step": 12648 }, { "epoch": 0.4015873015873016, "grad_norm": 0.076171875, "learning_rate": 0.1, "loss": 2.309741973876953, "step": 12650 }, { "epoch": 0.40165079365079365, "grad_norm": 0.154296875, "learning_rate": 0.1, "loss": 2.2926597595214844, "step": 12652 }, { "epoch": 0.4017142857142857, "grad_norm": 0.1904296875, "learning_rate": 0.1, "loss": 2.289318323135376, "step": 12654 }, { "epoch": 0.4017777777777778, "grad_norm": 0.19140625, "learning_rate": 0.1, "loss": 2.311735153198242, "step": 12656 }, { "epoch": 0.40184126984126983, "grad_norm": 0.08203125, "learning_rate": 0.1, "loss": 2.292684316635132, "step": 12658 }, { "epoch": 0.40190476190476193, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.2918014526367188, "step": 12660 }, { "epoch": 0.401968253968254, "grad_norm": 0.4375, "learning_rate": 0.1, "loss": 2.315018892288208, "step": 12662 }, { "epoch": 0.402031746031746, "grad_norm": 0.306640625, "learning_rate": 0.1, "loss": 2.2844858169555664, "step": 12664 }, { "epoch": 0.4020952380952381, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.2867681980133057, "step": 12666 }, { "epoch": 0.40215873015873016, "grad_norm": 0.1259765625, "learning_rate": 0.1, "loss": 2.2710859775543213, "step": 12668 }, { "epoch": 0.4022222222222222, "grad_norm": 0.16796875, "learning_rate": 0.1, "loss": 2.2854208946228027, "step": 12670 }, { "epoch": 0.4022857142857143, "grad_norm": 0.043212890625, "learning_rate": 0.1, "loss": 2.2909247875213623, "step": 12672 }, { "epoch": 0.40234920634920635, "grad_norm": 0.21875, "learning_rate": 0.1, "loss": 2.272588014602661, "step": 12674 }, { "epoch": 0.4024126984126984, "grad_norm": 0.318359375, "learning_rate": 0.1, "loss": 2.2849819660186768, "step": 12676 }, { "epoch": 0.4024761904761905, "grad_norm": 0.2294921875, "learning_rate": 0.1, "loss": 2.2895278930664062, "step": 12678 }, { "epoch": 0.40253968253968253, "grad_norm": 0.07958984375, "learning_rate": 0.1, "loss": 2.2852509021759033, "step": 12680 }, { "epoch": 0.40260317460317463, "grad_norm": 0.06640625, "learning_rate": 0.1, "loss": 2.284451484680176, "step": 12682 }, { "epoch": 0.4026666666666667, "grad_norm": 0.1552734375, "learning_rate": 0.1, "loss": 2.2716317176818848, "step": 12684 }, { "epoch": 0.4027301587301587, "grad_norm": 0.095703125, "learning_rate": 0.1, "loss": 2.2610225677490234, "step": 12686 }, { "epoch": 0.4027936507936508, "grad_norm": 0.138671875, "learning_rate": 0.1, "loss": 2.2757816314697266, "step": 12688 }, { "epoch": 0.40285714285714286, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.2554984092712402, "step": 12690 }, { "epoch": 0.4029206349206349, "grad_norm": 0.353515625, "learning_rate": 0.1, "loss": 2.2480530738830566, "step": 12692 }, { "epoch": 0.402984126984127, "grad_norm": 0.10888671875, "learning_rate": 0.1, "loss": 2.276384115219116, "step": 12694 }, { "epoch": 0.40304761904761904, "grad_norm": 0.140625, "learning_rate": 0.1, "loss": 2.269636869430542, "step": 12696 }, { "epoch": 0.4031111111111111, "grad_norm": 0.19921875, "learning_rate": 0.1, "loss": 2.2666258811950684, "step": 12698 }, { "epoch": 0.4031746031746032, "grad_norm": 0.06884765625, "learning_rate": 0.1, "loss": 2.276977062225342, "step": 12700 }, { "epoch": 0.40323809523809523, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.2830371856689453, "step": 12702 }, { "epoch": 0.4033015873015873, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.277653932571411, "step": 12704 }, { "epoch": 0.40336507936507937, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.3027429580688477, "step": 12706 }, { "epoch": 0.4034285714285714, "grad_norm": 0.12451171875, "learning_rate": 0.1, "loss": 2.2705657482147217, "step": 12708 }, { "epoch": 0.4034920634920635, "grad_norm": 0.314453125, "learning_rate": 0.1, "loss": 2.2589237689971924, "step": 12710 }, { "epoch": 0.40355555555555556, "grad_norm": 0.298828125, "learning_rate": 0.1, "loss": 2.239840030670166, "step": 12712 }, { "epoch": 0.4036190476190476, "grad_norm": 0.0830078125, "learning_rate": 0.1, "loss": 2.250976324081421, "step": 12714 }, { "epoch": 0.4036825396825397, "grad_norm": 0.09228515625, "learning_rate": 0.1, "loss": 2.246659517288208, "step": 12716 }, { "epoch": 0.40374603174603174, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.259284496307373, "step": 12718 }, { "epoch": 0.4038095238095238, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.2647953033447266, "step": 12720 }, { "epoch": 0.4038730158730159, "grad_norm": 0.228515625, "learning_rate": 0.1, "loss": 2.234811544418335, "step": 12722 }, { "epoch": 0.4039365079365079, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.2175509929656982, "step": 12724 }, { "epoch": 0.404, "grad_norm": 0.1337890625, "learning_rate": 0.1, "loss": 2.2525484561920166, "step": 12726 }, { "epoch": 0.40406349206349207, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.2338736057281494, "step": 12728 }, { "epoch": 0.4041269841269841, "grad_norm": 0.10302734375, "learning_rate": 0.1, "loss": 2.2381958961486816, "step": 12730 }, { "epoch": 0.4041904761904762, "grad_norm": 0.2177734375, "learning_rate": 0.1, "loss": 2.2337396144866943, "step": 12732 }, { "epoch": 0.40425396825396825, "grad_norm": 0.3828125, "learning_rate": 0.1, "loss": 2.2588319778442383, "step": 12734 }, { "epoch": 0.4043174603174603, "grad_norm": 0.279296875, "learning_rate": 0.1, "loss": 2.2670295238494873, "step": 12736 }, { "epoch": 0.4043809523809524, "grad_norm": 0.0869140625, "learning_rate": 0.1, "loss": 2.2243025302886963, "step": 12738 }, { "epoch": 0.40444444444444444, "grad_norm": 0.07861328125, "learning_rate": 0.1, "loss": 2.251145362854004, "step": 12740 }, { "epoch": 0.4045079365079365, "grad_norm": 0.11962890625, "learning_rate": 0.1, "loss": 2.2257883548736572, "step": 12742 }, { "epoch": 0.4045714285714286, "grad_norm": 0.2060546875, "learning_rate": 0.1, "loss": 2.225264549255371, "step": 12744 }, { "epoch": 0.4046349206349206, "grad_norm": 0.380859375, "learning_rate": 0.1, "loss": 2.2152347564697266, "step": 12746 }, { "epoch": 0.4046984126984127, "grad_norm": 0.39453125, "learning_rate": 0.1, "loss": 2.2306759357452393, "step": 12748 }, { "epoch": 0.40476190476190477, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.22837233543396, "step": 12750 }, { "epoch": 0.4048253968253968, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.257899522781372, "step": 12752 }, { "epoch": 0.4048888888888889, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.246670722961426, "step": 12754 }, { "epoch": 0.40495238095238095, "grad_norm": 0.14453125, "learning_rate": 0.1, "loss": 2.2245774269104004, "step": 12756 }, { "epoch": 0.405015873015873, "grad_norm": 0.06494140625, "learning_rate": 0.1, "loss": 2.2274630069732666, "step": 12758 }, { "epoch": 0.4050793650793651, "grad_norm": 0.052490234375, "learning_rate": 0.1, "loss": 2.240325927734375, "step": 12760 }, { "epoch": 0.40514285714285714, "grad_norm": 0.13671875, "learning_rate": 0.1, "loss": 2.2673754692077637, "step": 12762 }, { "epoch": 0.4052063492063492, "grad_norm": 0.06494140625, "learning_rate": 0.1, "loss": 2.2353477478027344, "step": 12764 }, { "epoch": 0.4052698412698413, "grad_norm": 0.1416015625, "learning_rate": 0.1, "loss": 2.2503252029418945, "step": 12766 }, { "epoch": 0.4053333333333333, "grad_norm": 0.41015625, "learning_rate": 0.1, "loss": 2.2447316646575928, "step": 12768 }, { "epoch": 0.4053968253968254, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.234184741973877, "step": 12770 }, { "epoch": 0.40546031746031747, "grad_norm": 0.06982421875, "learning_rate": 0.1, "loss": 2.243861198425293, "step": 12772 }, { "epoch": 0.4055238095238095, "grad_norm": 0.06982421875, "learning_rate": 0.1, "loss": 2.2590394020080566, "step": 12774 }, { "epoch": 0.4055873015873016, "grad_norm": 0.099609375, "learning_rate": 0.1, "loss": 2.251347064971924, "step": 12776 }, { "epoch": 0.40565079365079365, "grad_norm": 0.0791015625, "learning_rate": 0.1, "loss": 2.2699408531188965, "step": 12778 }, { "epoch": 0.4057142857142857, "grad_norm": 0.212890625, "learning_rate": 0.1, "loss": 2.25612735748291, "step": 12780 }, { "epoch": 0.4057777777777778, "grad_norm": 0.1953125, "learning_rate": 0.1, "loss": 2.2291111946105957, "step": 12782 }, { "epoch": 0.40584126984126984, "grad_norm": 0.1171875, "learning_rate": 0.1, "loss": 2.274329900741577, "step": 12784 }, { "epoch": 0.4059047619047619, "grad_norm": 0.126953125, "learning_rate": 0.1, "loss": 2.2519423961639404, "step": 12786 }, { "epoch": 0.405968253968254, "grad_norm": 0.07421875, "learning_rate": 0.1, "loss": 2.256042957305908, "step": 12788 }, { "epoch": 0.406031746031746, "grad_norm": 0.33203125, "learning_rate": 0.1, "loss": 2.272219657897949, "step": 12790 }, { "epoch": 0.4060952380952381, "grad_norm": 0.578125, "learning_rate": 0.1, "loss": 2.256260871887207, "step": 12792 }, { "epoch": 0.40615873015873016, "grad_norm": 0.08544921875, "learning_rate": 0.1, "loss": 2.251669406890869, "step": 12794 }, { "epoch": 0.4062222222222222, "grad_norm": 0.04541015625, "learning_rate": 0.1, "loss": 2.269117593765259, "step": 12796 }, { "epoch": 0.4062857142857143, "grad_norm": 0.08447265625, "learning_rate": 0.1, "loss": 2.2264370918273926, "step": 12798 }, { "epoch": 0.40634920634920635, "grad_norm": 0.125, "learning_rate": 0.1, "loss": 2.255627393722534, "step": 12800 }, { "epoch": 0.4064126984126984, "grad_norm": 0.16015625, "learning_rate": 0.1, "loss": 2.232905387878418, "step": 12802 }, { "epoch": 0.4064761904761905, "grad_norm": 0.177734375, "learning_rate": 0.1, "loss": 2.2764182090759277, "step": 12804 }, { "epoch": 0.40653968253968253, "grad_norm": 0.1083984375, "learning_rate": 0.1, "loss": 2.243446111679077, "step": 12806 }, { "epoch": 0.4066031746031746, "grad_norm": 0.072265625, "learning_rate": 0.1, "loss": 2.2535452842712402, "step": 12808 }, { "epoch": 0.4066666666666667, "grad_norm": 0.0458984375, "learning_rate": 0.1, "loss": 2.2059407234191895, "step": 12810 }, { "epoch": 0.4067301587301587, "grad_norm": 0.162109375, "learning_rate": 0.1, "loss": 2.250304698944092, "step": 12812 }, { "epoch": 0.4067936507936508, "grad_norm": 0.2099609375, "learning_rate": 0.1, "loss": 2.2253971099853516, "step": 12814 }, { "epoch": 0.40685714285714286, "grad_norm": 0.1376953125, "learning_rate": 0.1, "loss": 2.227365493774414, "step": 12816 }, { "epoch": 0.4069206349206349, "grad_norm": 0.0908203125, "learning_rate": 0.1, "loss": 2.25632905960083, "step": 12818 }, { "epoch": 0.406984126984127, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.231961965560913, "step": 12820 }, { "epoch": 0.40704761904761905, "grad_norm": 0.34375, "learning_rate": 0.1, "loss": 2.2555220127105713, "step": 12822 }, { "epoch": 0.4071111111111111, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.24263334274292, "step": 12824 }, { "epoch": 0.4071746031746032, "grad_norm": 0.173828125, "learning_rate": 0.1, "loss": 2.242009162902832, "step": 12826 }, { "epoch": 0.40723809523809523, "grad_norm": 0.1494140625, "learning_rate": 0.1, "loss": 2.246868371963501, "step": 12828 }, { "epoch": 0.4073015873015873, "grad_norm": 0.2060546875, "learning_rate": 0.1, "loss": 2.2295122146606445, "step": 12830 }, { "epoch": 0.4073650793650794, "grad_norm": 0.267578125, "learning_rate": 0.1, "loss": 2.263503313064575, "step": 12832 }, { "epoch": 0.4074285714285714, "grad_norm": 0.09423828125, "learning_rate": 0.1, "loss": 2.241060256958008, "step": 12834 }, { "epoch": 0.4074920634920635, "grad_norm": 0.376953125, "learning_rate": 0.1, "loss": 2.2220916748046875, "step": 12836 }, { "epoch": 0.40755555555555556, "grad_norm": 0.27734375, "learning_rate": 0.1, "loss": 2.221209764480591, "step": 12838 }, { "epoch": 0.4076190476190476, "grad_norm": 0.10546875, "learning_rate": 0.1, "loss": 2.230886220932007, "step": 12840 }, { "epoch": 0.4076825396825397, "grad_norm": 0.123046875, "learning_rate": 0.1, "loss": 2.2281064987182617, "step": 12842 }, { "epoch": 0.40774603174603175, "grad_norm": 0.09765625, "learning_rate": 0.1, "loss": 2.2533373832702637, "step": 12844 }, { "epoch": 0.4078095238095238, "grad_norm": 0.11279296875, "learning_rate": 0.1, "loss": 2.220545530319214, "step": 12846 }, { "epoch": 0.4078730158730159, "grad_norm": 0.1962890625, "learning_rate": 0.1, "loss": 2.207355499267578, "step": 12848 }, { "epoch": 0.40793650793650793, "grad_norm": 0.07568359375, "learning_rate": 0.1, "loss": 2.2466094493865967, "step": 12850 }, { "epoch": 0.408, "grad_norm": 0.083984375, "learning_rate": 0.1, "loss": 2.2127885818481445, "step": 12852 }, { "epoch": 0.4080634920634921, "grad_norm": 0.2216796875, "learning_rate": 0.1, "loss": 2.2296013832092285, "step": 12854 }, { "epoch": 0.4081269841269841, "grad_norm": 0.34765625, "learning_rate": 0.1, "loss": 2.2332053184509277, "step": 12856 }, { "epoch": 0.4081904761904762, "grad_norm": 0.11962890625, "learning_rate": 0.1, "loss": 2.2455201148986816, "step": 12858 }, { "epoch": 0.40825396825396826, "grad_norm": 0.2041015625, "learning_rate": 0.1, "loss": 2.2342209815979004, "step": 12860 }, { "epoch": 0.4083174603174603, "grad_norm": 0.134765625, "learning_rate": 0.1, "loss": 2.2207164764404297, "step": 12862 }, { "epoch": 0.4083809523809524, "grad_norm": 0.220703125, "learning_rate": 0.1, "loss": 2.2393648624420166, "step": 12864 }, { "epoch": 0.40844444444444444, "grad_norm": 0.146484375, "learning_rate": 0.1, "loss": 2.2382168769836426, "step": 12866 }, { "epoch": 0.4085079365079365, "grad_norm": 0.06689453125, "learning_rate": 0.1, "loss": 2.2354063987731934, "step": 12868 }, { "epoch": 0.4085714285714286, "grad_norm": 0.205078125, "learning_rate": 0.1, "loss": 2.2208573818206787, "step": 12870 }, { "epoch": 0.40863492063492063, "grad_norm": 0.06689453125, "learning_rate": 0.1, "loss": 2.2357020378112793, "step": 12872 }, { "epoch": 0.40869841269841267, "grad_norm": 0.1513671875, "learning_rate": 0.1, "loss": 2.2209126949310303, "step": 12874 }, { "epoch": 0.40876190476190477, "grad_norm": 0.310546875, "learning_rate": 0.1, "loss": 2.2232158184051514, "step": 12876 }, { "epoch": 0.4088253968253968, "grad_norm": 0.275390625, "learning_rate": 0.1, "loss": 2.1869778633117676, "step": 12878 }, { "epoch": 0.4088888888888889, "grad_norm": 0.1484375, "learning_rate": 0.1, "loss": 2.230013847351074, "step": 12880 }, { "epoch": 0.40895238095238096, "grad_norm": 0.1416015625, "learning_rate": 0.1, "loss": 2.2143027782440186, "step": 12882 }, { "epoch": 0.409015873015873, "grad_norm": 0.171875, "learning_rate": 0.1, "loss": 2.2395071983337402, "step": 12884 }, { "epoch": 0.4090793650793651, "grad_norm": 0.18359375, "learning_rate": 0.1, "loss": 2.2181894779205322, "step": 12886 }, { "epoch": 0.40914285714285714, "grad_norm": 0.1298828125, "learning_rate": 0.1, "loss": 2.237212657928467, "step": 12888 }, { "epoch": 0.4092063492063492, "grad_norm": 0.1787109375, "learning_rate": 0.1, "loss": 2.204676866531372, "step": 12890 }, { "epoch": 0.4092698412698413, "grad_norm": 0.3515625, "learning_rate": 0.1, "loss": 2.2483561038970947, "step": 12892 }, { "epoch": 0.4093333333333333, "grad_norm": 0.12109375, "learning_rate": 0.1, "loss": 2.2073922157287598, "step": 12894 }, { "epoch": 0.40939682539682537, "grad_norm": 0.1669921875, "learning_rate": 0.1, "loss": 2.2114458084106445, "step": 12896 }, { "epoch": 0.40946031746031747, "grad_norm": 0.2314453125, "learning_rate": 0.1, "loss": 2.2161312103271484, "step": 12898 }, { "epoch": 0.4095238095238095, "grad_norm": 0.07470703125, "learning_rate": 0.1, "loss": 2.237602472305298, "step": 12900 }, { "epoch": 0.4095873015873016, "grad_norm": 0.1396484375, "learning_rate": 0.1, "loss": 2.218475580215454, "step": 12902 }, { "epoch": 0.40965079365079365, "grad_norm": 0.1767578125, "learning_rate": 0.1, "loss": 2.231311798095703, "step": 12904 }, { "epoch": 0.4097142857142857, "grad_norm": 0.11328125, "learning_rate": 0.1, "loss": 2.2220139503479004, "step": 12906 }, { "epoch": 0.4097777777777778, "grad_norm": 0.10009765625, "learning_rate": 0.1, "loss": 2.226863384246826, "step": 12908 }, { "epoch": 0.40984126984126984, "grad_norm": 0.08056640625, "learning_rate": 0.1, "loss": 2.22263503074646, "step": 12910 }, { "epoch": 0.4099047619047619, "grad_norm": 0.25, "learning_rate": 0.1, "loss": 2.216628074645996, "step": 12912 }, { "epoch": 0.409968253968254, "grad_norm": 0.4765625, "learning_rate": 0.1, "loss": 2.2253456115722656, "step": 12914 } ], "logging_steps": 2, "max_steps": 31500, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 315, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.277200399146157e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }