| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9999700320656897, |
| "eval_steps": 500, |
| "global_step": 8342, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0005993586862057599, |
| "grad_norm": 108.0, |
| "learning_rate": 5.988023952095808e-09, |
| "loss": 101.4288, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0011987173724115198, |
| "grad_norm": 106.75, |
| "learning_rate": 1.1976047904191617e-08, |
| "loss": 102.372, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0017980760586172794, |
| "grad_norm": 106.6875, |
| "learning_rate": 1.7964071856287425e-08, |
| "loss": 100.4109, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.0023974347448230396, |
| "grad_norm": 107.375, |
| "learning_rate": 2.3952095808383233e-08, |
| "loss": 102.0437, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0029967934310287992, |
| "grad_norm": 102.5, |
| "learning_rate": 2.994011976047904e-08, |
| "loss": 100.0221, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.003596152117234559, |
| "grad_norm": 104.9375, |
| "learning_rate": 3.592814371257485e-08, |
| "loss": 102.2138, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.004195510803440319, |
| "grad_norm": 104.625, |
| "learning_rate": 4.191616766467065e-08, |
| "loss": 100.7864, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.004794869489646079, |
| "grad_norm": 101.5625, |
| "learning_rate": 4.7904191616766466e-08, |
| "loss": 100.7124, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.005394228175851838, |
| "grad_norm": 105.8125, |
| "learning_rate": 5.3892215568862274e-08, |
| "loss": 100.6161, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.0059935868620575984, |
| "grad_norm": 100.875, |
| "learning_rate": 5.988023952095808e-08, |
| "loss": 100.9569, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0065929455482633586, |
| "grad_norm": 104.5625, |
| "learning_rate": 6.586826347305389e-08, |
| "loss": 99.5128, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.007192304234469118, |
| "grad_norm": 107.1875, |
| "learning_rate": 7.18562874251497e-08, |
| "loss": 100.0423, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.007791662920674878, |
| "grad_norm": 103.1875, |
| "learning_rate": 7.784431137724551e-08, |
| "loss": 99.8547, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.008391021606880638, |
| "grad_norm": 103.4375, |
| "learning_rate": 8.38323353293413e-08, |
| "loss": 99.0963, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.008990380293086398, |
| "grad_norm": 103.625, |
| "learning_rate": 8.982035928143712e-08, |
| "loss": 101.1167, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.009589738979292158, |
| "grad_norm": 105.25, |
| "learning_rate": 9.580838323353293e-08, |
| "loss": 98.1266, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.010189097665497917, |
| "grad_norm": 103.3125, |
| "learning_rate": 1.0179640718562874e-07, |
| "loss": 100.9546, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.010788456351703677, |
| "grad_norm": 104.4375, |
| "learning_rate": 1.0778443113772455e-07, |
| "loss": 99.5327, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.011387815037909437, |
| "grad_norm": 104.875, |
| "learning_rate": 1.1377245508982034e-07, |
| "loss": 99.6909, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.011987173724115197, |
| "grad_norm": 107.25, |
| "learning_rate": 1.1976047904191617e-07, |
| "loss": 100.4652, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.012586532410320957, |
| "grad_norm": 107.625, |
| "learning_rate": 1.2574850299401197e-07, |
| "loss": 100.3513, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.013185891096526717, |
| "grad_norm": 107.625, |
| "learning_rate": 1.3173652694610778e-07, |
| "loss": 99.1105, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.013785249782732475, |
| "grad_norm": 106.8125, |
| "learning_rate": 1.377245508982036e-07, |
| "loss": 99.3605, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.014384608468938236, |
| "grad_norm": 104.6875, |
| "learning_rate": 1.437125748502994e-07, |
| "loss": 100.7677, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.014983967155143996, |
| "grad_norm": 102.125, |
| "learning_rate": 1.4970059880239518e-07, |
| "loss": 98.4561, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.015583325841349756, |
| "grad_norm": 103.9375, |
| "learning_rate": 1.5568862275449102e-07, |
| "loss": 97.9862, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.016182684527555516, |
| "grad_norm": 105.625, |
| "learning_rate": 1.6167664670658682e-07, |
| "loss": 99.3458, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.016782043213761276, |
| "grad_norm": 105.5, |
| "learning_rate": 1.676646706586826e-07, |
| "loss": 98.1435, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.017381401899967036, |
| "grad_norm": 106.875, |
| "learning_rate": 1.7365269461077844e-07, |
| "loss": 98.1245, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.017980760586172796, |
| "grad_norm": 106.0, |
| "learning_rate": 1.7964071856287425e-07, |
| "loss": 97.9571, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.018580119272378556, |
| "grad_norm": 107.125, |
| "learning_rate": 1.8562874251497006e-07, |
| "loss": 99.0638, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.019179477958584316, |
| "grad_norm": 104.875, |
| "learning_rate": 1.9161676646706586e-07, |
| "loss": 100.1442, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.019778836644790073, |
| "grad_norm": 103.0625, |
| "learning_rate": 1.9760479041916167e-07, |
| "loss": 99.5631, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.020378195330995833, |
| "grad_norm": 103.25, |
| "learning_rate": 2.0359281437125748e-07, |
| "loss": 99.3583, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.020977554017201593, |
| "grad_norm": 111.0, |
| "learning_rate": 2.0958083832335326e-07, |
| "loss": 99.157, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.021576912703407353, |
| "grad_norm": 108.0625, |
| "learning_rate": 2.155688622754491e-07, |
| "loss": 98.6387, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.022176271389613113, |
| "grad_norm": 107.625, |
| "learning_rate": 2.215568862275449e-07, |
| "loss": 98.7289, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.022775630075818874, |
| "grad_norm": 106.0, |
| "learning_rate": 2.275449101796407e-07, |
| "loss": 99.7657, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.023374988762024634, |
| "grad_norm": 108.875, |
| "learning_rate": 2.3353293413173652e-07, |
| "loss": 99.5338, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.023974347448230394, |
| "grad_norm": 105.3125, |
| "learning_rate": 2.3952095808383233e-07, |
| "loss": 98.2597, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.024573706134436154, |
| "grad_norm": 106.25, |
| "learning_rate": 2.455089820359281e-07, |
| "loss": 98.0168, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.025173064820641914, |
| "grad_norm": 105.1875, |
| "learning_rate": 2.5149700598802395e-07, |
| "loss": 98.9083, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.025772423506847674, |
| "grad_norm": 102.9375, |
| "learning_rate": 2.5748502994011973e-07, |
| "loss": 98.8039, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.026371782193053434, |
| "grad_norm": 102.875, |
| "learning_rate": 2.6347305389221556e-07, |
| "loss": 96.897, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.026971140879259194, |
| "grad_norm": 106.0, |
| "learning_rate": 2.694610778443114e-07, |
| "loss": 98.8567, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.02757049956546495, |
| "grad_norm": 104.4375, |
| "learning_rate": 2.754491017964072e-07, |
| "loss": 99.9957, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.02816985825167071, |
| "grad_norm": 103.875, |
| "learning_rate": 2.8143712574850296e-07, |
| "loss": 98.0264, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.02876921693787647, |
| "grad_norm": 107.0625, |
| "learning_rate": 2.874251497005988e-07, |
| "loss": 97.1543, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.02936857562408223, |
| "grad_norm": 103.3125, |
| "learning_rate": 2.934131736526946e-07, |
| "loss": 98.1825, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.02996793431028799, |
| "grad_norm": 105.875, |
| "learning_rate": 2.9940119760479036e-07, |
| "loss": 98.2953, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.03056729299649375, |
| "grad_norm": 107.1875, |
| "learning_rate": 3.0538922155688625e-07, |
| "loss": 99.6612, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.03116665168269951, |
| "grad_norm": 107.375, |
| "learning_rate": 3.1137724550898203e-07, |
| "loss": 97.6773, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.03176601036890527, |
| "grad_norm": 106.1875, |
| "learning_rate": 3.173652694610778e-07, |
| "loss": 98.2748, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.03236536905511103, |
| "grad_norm": 106.0625, |
| "learning_rate": 3.2335329341317365e-07, |
| "loss": 97.7014, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.03296472774131679, |
| "grad_norm": 109.8125, |
| "learning_rate": 3.2934131736526943e-07, |
| "loss": 99.1549, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.03356408642752255, |
| "grad_norm": 106.0, |
| "learning_rate": 3.353293413173652e-07, |
| "loss": 98.5528, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.03416344511372831, |
| "grad_norm": 107.3125, |
| "learning_rate": 3.413173652694611e-07, |
| "loss": 98.652, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.03476280379993407, |
| "grad_norm": 102.625, |
| "learning_rate": 3.473053892215569e-07, |
| "loss": 99.1912, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.03536216248613983, |
| "grad_norm": 103.8125, |
| "learning_rate": 3.5329341317365266e-07, |
| "loss": 98.7347, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.03596152117234559, |
| "grad_norm": 104.3125, |
| "learning_rate": 3.592814371257485e-07, |
| "loss": 98.0729, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.03656087985855135, |
| "grad_norm": 108.75, |
| "learning_rate": 3.652694610778443e-07, |
| "loss": 98.3793, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.03716023854475711, |
| "grad_norm": 105.3125, |
| "learning_rate": 3.712574850299401e-07, |
| "loss": 99.3356, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.03775959723096287, |
| "grad_norm": 107.8125, |
| "learning_rate": 3.772455089820359e-07, |
| "loss": 96.9595, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.03835895591716863, |
| "grad_norm": 107.6875, |
| "learning_rate": 3.8323353293413173e-07, |
| "loss": 97.5843, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.038958314603374386, |
| "grad_norm": 105.125, |
| "learning_rate": 3.8922155688622756e-07, |
| "loss": 100.0804, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.039557673289580146, |
| "grad_norm": 108.9375, |
| "learning_rate": 3.9520958083832335e-07, |
| "loss": 96.4505, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.040157031975785906, |
| "grad_norm": 104.625, |
| "learning_rate": 4.0119760479041913e-07, |
| "loss": 98.6832, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.040756390661991666, |
| "grad_norm": 109.1875, |
| "learning_rate": 4.0718562874251496e-07, |
| "loss": 97.3873, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.041355749348197426, |
| "grad_norm": 105.375, |
| "learning_rate": 4.1317365269461074e-07, |
| "loss": 96.7431, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.04195510803440319, |
| "grad_norm": 108.0625, |
| "learning_rate": 4.191616766467065e-07, |
| "loss": 96.5581, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.04255446672060895, |
| "grad_norm": 105.875, |
| "learning_rate": 4.251497005988024e-07, |
| "loss": 97.4509, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.04315382540681471, |
| "grad_norm": 104.375, |
| "learning_rate": 4.311377245508982e-07, |
| "loss": 98.1399, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.04375318409302047, |
| "grad_norm": 108.125, |
| "learning_rate": 4.37125748502994e-07, |
| "loss": 97.8897, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.04435254277922623, |
| "grad_norm": 105.25, |
| "learning_rate": 4.431137724550898e-07, |
| "loss": 98.8462, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.04495190146543199, |
| "grad_norm": 107.625, |
| "learning_rate": 4.491017964071856e-07, |
| "loss": 98.7402, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.04555126015163775, |
| "grad_norm": 109.4375, |
| "learning_rate": 4.550898203592814e-07, |
| "loss": 98.7856, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.04615061883784351, |
| "grad_norm": 108.3125, |
| "learning_rate": 4.6107784431137726e-07, |
| "loss": 98.0607, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.04674997752404927, |
| "grad_norm": 106.25, |
| "learning_rate": 4.6706586826347305e-07, |
| "loss": 98.49, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.04734933621025503, |
| "grad_norm": 108.25, |
| "learning_rate": 4.7305389221556883e-07, |
| "loss": 97.775, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.04794869489646079, |
| "grad_norm": 108.5, |
| "learning_rate": 4.790419161676647e-07, |
| "loss": 96.6219, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.04854805358266655, |
| "grad_norm": 105.875, |
| "learning_rate": 4.850299401197605e-07, |
| "loss": 98.3288, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.04914741226887231, |
| "grad_norm": 104.875, |
| "learning_rate": 4.910179640718562e-07, |
| "loss": 98.0089, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.04974677095507807, |
| "grad_norm": 106.5625, |
| "learning_rate": 4.970059880239521e-07, |
| "loss": 98.042, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.05034612964128383, |
| "grad_norm": 107.4375, |
| "learning_rate": 5.029940119760479e-07, |
| "loss": 98.1441, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.05094548832748959, |
| "grad_norm": 109.9375, |
| "learning_rate": 5.089820359281437e-07, |
| "loss": 98.7648, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.05154484701369535, |
| "grad_norm": 110.4375, |
| "learning_rate": 5.149700598802395e-07, |
| "loss": 98.7029, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.05214420569990111, |
| "grad_norm": 107.8125, |
| "learning_rate": 5.209580838323353e-07, |
| "loss": 97.6725, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.05274356438610687, |
| "grad_norm": 109.125, |
| "learning_rate": 5.269461077844311e-07, |
| "loss": 97.5752, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.05334292307231263, |
| "grad_norm": 103.4375, |
| "learning_rate": 5.329341317365269e-07, |
| "loss": 97.6427, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.05394228175851839, |
| "grad_norm": 104.5625, |
| "learning_rate": 5.389221556886228e-07, |
| "loss": 99.5309, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.05454164044472414, |
| "grad_norm": 107.25, |
| "learning_rate": 5.449101796407185e-07, |
| "loss": 98.6651, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.0551409991309299, |
| "grad_norm": 108.25, |
| "learning_rate": 5.508982035928144e-07, |
| "loss": 97.7356, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.05574035781713566, |
| "grad_norm": 108.3125, |
| "learning_rate": 5.568862275449101e-07, |
| "loss": 97.3537, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.05633971650334142, |
| "grad_norm": 106.25, |
| "learning_rate": 5.628742514970059e-07, |
| "loss": 97.6986, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.05693907518954718, |
| "grad_norm": 107.0, |
| "learning_rate": 5.688622754491019e-07, |
| "loss": 97.4101, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.05753843387575294, |
| "grad_norm": 106.25, |
| "learning_rate": 5.748502994011976e-07, |
| "loss": 98.0793, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.0581377925619587, |
| "grad_norm": 103.75, |
| "learning_rate": 5.808383233532934e-07, |
| "loss": 99.4187, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.05873715124816446, |
| "grad_norm": 103.3125, |
| "learning_rate": 5.868263473053892e-07, |
| "loss": 98.6429, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.05933650993437022, |
| "grad_norm": 107.5625, |
| "learning_rate": 5.92814371257485e-07, |
| "loss": 97.5, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.05993586862057598, |
| "grad_norm": 105.5625, |
| "learning_rate": 5.988023952095807e-07, |
| "loss": 97.07, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.05993586862057598, |
| "eval_loss": 3.0657827854156494, |
| "eval_runtime": 402.7563, |
| "eval_samples_per_second": 1116.315, |
| "eval_steps_per_second": 34.887, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.06053522730678174, |
| "grad_norm": 109.75, |
| "learning_rate": 6.047904191616767e-07, |
| "loss": 97.7991, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.0611345859929875, |
| "grad_norm": 108.125, |
| "learning_rate": 6.107784431137725e-07, |
| "loss": 97.4808, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.06173394467919326, |
| "grad_norm": 103.8125, |
| "learning_rate": 6.167664670658682e-07, |
| "loss": 97.707, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.06233330336539902, |
| "grad_norm": 108.375, |
| "learning_rate": 6.227544910179641e-07, |
| "loss": 98.3281, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.06293266205160478, |
| "grad_norm": 110.3125, |
| "learning_rate": 6.287425149700598e-07, |
| "loss": 99.0089, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.06353202073781054, |
| "grad_norm": 105.75, |
| "learning_rate": 6.347305389221556e-07, |
| "loss": 98.7936, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.0641313794240163, |
| "grad_norm": 105.5, |
| "learning_rate": 6.407185628742516e-07, |
| "loss": 98.6227, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.06473073811022206, |
| "grad_norm": 104.3125, |
| "learning_rate": 6.467065868263473e-07, |
| "loss": 97.955, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.06533009679642782, |
| "grad_norm": 107.75, |
| "learning_rate": 6.526946107784431e-07, |
| "loss": 98.5175, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.06592945548263358, |
| "grad_norm": 107.125, |
| "learning_rate": 6.586826347305389e-07, |
| "loss": 97.9932, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.06652881416883934, |
| "grad_norm": 105.75, |
| "learning_rate": 6.646706586826347e-07, |
| "loss": 99.2275, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.0671281728550451, |
| "grad_norm": 104.9375, |
| "learning_rate": 6.706586826347304e-07, |
| "loss": 98.1961, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.06772753154125086, |
| "grad_norm": 105.6875, |
| "learning_rate": 6.766467065868264e-07, |
| "loss": 98.7839, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.06832689022745662, |
| "grad_norm": 110.25, |
| "learning_rate": 6.826347305389222e-07, |
| "loss": 98.6332, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.06892624891366238, |
| "grad_norm": 105.25, |
| "learning_rate": 6.886227544910179e-07, |
| "loss": 100.4485, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.06952560759986814, |
| "grad_norm": 111.75, |
| "learning_rate": 6.946107784431138e-07, |
| "loss": 98.7815, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.0701249662860739, |
| "grad_norm": 107.125, |
| "learning_rate": 7.005988023952095e-07, |
| "loss": 98.5106, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.07072432497227966, |
| "grad_norm": 106.375, |
| "learning_rate": 7.065868263473053e-07, |
| "loss": 97.8071, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.07132368365848542, |
| "grad_norm": 106.875, |
| "learning_rate": 7.125748502994012e-07, |
| "loss": 98.2991, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.07192304234469118, |
| "grad_norm": 105.9375, |
| "learning_rate": 7.18562874251497e-07, |
| "loss": 97.3966, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.07252240103089694, |
| "grad_norm": 110.0, |
| "learning_rate": 7.245508982035928e-07, |
| "loss": 100.179, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.0731217597171027, |
| "grad_norm": 110.0625, |
| "learning_rate": 7.305389221556886e-07, |
| "loss": 98.9886, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.07372111840330846, |
| "grad_norm": 106.25, |
| "learning_rate": 7.365269461077844e-07, |
| "loss": 98.379, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.07432047708951423, |
| "grad_norm": 108.1875, |
| "learning_rate": 7.425149700598802e-07, |
| "loss": 98.8186, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.07491983577571998, |
| "grad_norm": 109.375, |
| "learning_rate": 7.485029940119761e-07, |
| "loss": 98.6054, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.07551919446192575, |
| "grad_norm": 111.0625, |
| "learning_rate": 7.544910179640718e-07, |
| "loss": 99.7864, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.0761185531481315, |
| "grad_norm": 109.3125, |
| "learning_rate": 7.604790419161676e-07, |
| "loss": 99.6344, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.07671791183433727, |
| "grad_norm": 107.25, |
| "learning_rate": 7.664670658682635e-07, |
| "loss": 98.9922, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.07731727052054302, |
| "grad_norm": 104.25, |
| "learning_rate": 7.724550898203592e-07, |
| "loss": 100.2282, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.07791662920674877, |
| "grad_norm": 108.9375, |
| "learning_rate": 7.784431137724551e-07, |
| "loss": 97.6066, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.07851598789295454, |
| "grad_norm": 109.1875, |
| "learning_rate": 7.844311377245509e-07, |
| "loss": 99.5096, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.07911534657916029, |
| "grad_norm": 110.0, |
| "learning_rate": 7.904191616766467e-07, |
| "loss": 98.4398, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.07971470526536606, |
| "grad_norm": 107.3125, |
| "learning_rate": 7.964071856287424e-07, |
| "loss": 97.4769, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.08031406395157181, |
| "grad_norm": 104.4375, |
| "learning_rate": 8.023952095808383e-07, |
| "loss": 98.9293, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.08091342263777758, |
| "grad_norm": 106.0, |
| "learning_rate": 8.083832335329341e-07, |
| "loss": 98.9894, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.08151278132398333, |
| "grad_norm": 108.125, |
| "learning_rate": 8.143712574850299e-07, |
| "loss": 99.3078, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.0821121400101891, |
| "grad_norm": 108.0, |
| "learning_rate": 8.203592814371258e-07, |
| "loss": 99.7428, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.08271149869639485, |
| "grad_norm": 107.75, |
| "learning_rate": 8.263473053892215e-07, |
| "loss": 100.1608, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.08331085738260062, |
| "grad_norm": 109.25, |
| "learning_rate": 8.323353293413173e-07, |
| "loss": 98.0089, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.08391021606880637, |
| "grad_norm": 109.875, |
| "learning_rate": 8.38323353293413e-07, |
| "loss": 99.6416, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.08450957475501214, |
| "grad_norm": 106.5, |
| "learning_rate": 8.443113772455089e-07, |
| "loss": 98.5848, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.0851089334412179, |
| "grad_norm": 106.625, |
| "learning_rate": 8.502994011976048e-07, |
| "loss": 99.8258, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.08570829212742366, |
| "grad_norm": 106.75, |
| "learning_rate": 8.562874251497006e-07, |
| "loss": 98.2121, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.08630765081362941, |
| "grad_norm": 108.75, |
| "learning_rate": 8.622754491017964e-07, |
| "loss": 98.832, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.08690700949983518, |
| "grad_norm": 105.1875, |
| "learning_rate": 8.682634730538921e-07, |
| "loss": 99.0368, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.08750636818604093, |
| "grad_norm": 106.3125, |
| "learning_rate": 8.74251497005988e-07, |
| "loss": 99.2618, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.0881057268722467, |
| "grad_norm": 108.875, |
| "learning_rate": 8.802395209580839e-07, |
| "loss": 100.3378, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.08870508555845245, |
| "grad_norm": 104.1875, |
| "learning_rate": 8.862275449101796e-07, |
| "loss": 99.1748, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.08930444424465822, |
| "grad_norm": 109.375, |
| "learning_rate": 8.922155688622755e-07, |
| "loss": 100.3106, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.08990380293086397, |
| "grad_norm": 106.1875, |
| "learning_rate": 8.982035928143712e-07, |
| "loss": 97.4286, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.09050316161706974, |
| "grad_norm": 107.3125, |
| "learning_rate": 9.04191616766467e-07, |
| "loss": 99.5878, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.0911025203032755, |
| "grad_norm": 105.375, |
| "learning_rate": 9.101796407185628e-07, |
| "loss": 99.2444, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.09170187898948126, |
| "grad_norm": 109.9375, |
| "learning_rate": 9.161676646706587e-07, |
| "loss": 100.2878, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.09230123767568701, |
| "grad_norm": 106.375, |
| "learning_rate": 9.221556886227545e-07, |
| "loss": 99.3472, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.09290059636189278, |
| "grad_norm": 107.5, |
| "learning_rate": 9.281437125748503e-07, |
| "loss": 99.7771, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.09349995504809853, |
| "grad_norm": 110.75, |
| "learning_rate": 9.341317365269461e-07, |
| "loss": 101.1653, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.09409931373430429, |
| "grad_norm": 107.0625, |
| "learning_rate": 9.401197604790418e-07, |
| "loss": 98.7351, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.09469867242051005, |
| "grad_norm": 109.8125, |
| "learning_rate": 9.461077844311377e-07, |
| "loss": 98.4055, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.09529803110671581, |
| "grad_norm": 105.6875, |
| "learning_rate": 9.520958083832335e-07, |
| "loss": 100.6708, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.09589738979292158, |
| "grad_norm": 107.9375, |
| "learning_rate": 9.580838323353293e-07, |
| "loss": 99.4253, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.09649674847912733, |
| "grad_norm": 102.1875, |
| "learning_rate": 9.640718562874252e-07, |
| "loss": 99.4844, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.0970961071653331, |
| "grad_norm": 104.3125, |
| "learning_rate": 9.70059880239521e-07, |
| "loss": 100.4691, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.09769546585153885, |
| "grad_norm": 104.8125, |
| "learning_rate": 9.760479041916168e-07, |
| "loss": 98.4424, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.09829482453774462, |
| "grad_norm": 106.375, |
| "learning_rate": 9.820359281437125e-07, |
| "loss": 99.2889, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.09889418322395037, |
| "grad_norm": 104.4375, |
| "learning_rate": 9.880239520958083e-07, |
| "loss": 98.5602, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.09949354191015614, |
| "grad_norm": 104.0625, |
| "learning_rate": 9.940119760479041e-07, |
| "loss": 98.6925, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.10009290059636189, |
| "grad_norm": 106.0, |
| "learning_rate": 1e-06, |
| "loss": 99.9813, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.10069225928256766, |
| "grad_norm": 104.3125, |
| "learning_rate": 9.993339549753564e-07, |
| "loss": 100.2518, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.10129161796877341, |
| "grad_norm": 107.5625, |
| "learning_rate": 9.986679099507126e-07, |
| "loss": 101.2464, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.10189097665497918, |
| "grad_norm": 110.25, |
| "learning_rate": 9.98001864926069e-07, |
| "loss": 98.2911, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.10249033534118493, |
| "grad_norm": 104.4375, |
| "learning_rate": 9.973358199014254e-07, |
| "loss": 98.8816, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.1030896940273907, |
| "grad_norm": 106.5625, |
| "learning_rate": 9.966697748767816e-07, |
| "loss": 100.1502, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.10368905271359645, |
| "grad_norm": 104.1875, |
| "learning_rate": 9.96003729852138e-07, |
| "loss": 99.8585, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.10428841139980222, |
| "grad_norm": 102.375, |
| "learning_rate": 9.953376848274942e-07, |
| "loss": 99.9457, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.10488777008600797, |
| "grad_norm": 104.375, |
| "learning_rate": 9.946716398028506e-07, |
| "loss": 100.6051, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.10548712877221374, |
| "grad_norm": 105.75, |
| "learning_rate": 9.94005594778207e-07, |
| "loss": 98.5651, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.10608648745841949, |
| "grad_norm": 105.5625, |
| "learning_rate": 9.933395497535634e-07, |
| "loss": 99.5896, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.10668584614462526, |
| "grad_norm": 104.1875, |
| "learning_rate": 9.926735047289196e-07, |
| "loss": 101.2675, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.10728520483083101, |
| "grad_norm": 110.75, |
| "learning_rate": 9.92007459704276e-07, |
| "loss": 100.4223, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.10788456351703678, |
| "grad_norm": 107.4375, |
| "learning_rate": 9.913414146796324e-07, |
| "loss": 99.0593, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.10848392220324253, |
| "grad_norm": 107.625, |
| "learning_rate": 9.906753696549886e-07, |
| "loss": 99.1331, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.10908328088944828, |
| "grad_norm": 109.25, |
| "learning_rate": 9.90009324630345e-07, |
| "loss": 98.8885, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.10968263957565405, |
| "grad_norm": 102.875, |
| "learning_rate": 9.893432796057014e-07, |
| "loss": 99.1233, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.1102819982618598, |
| "grad_norm": 109.5, |
| "learning_rate": 9.886772345810576e-07, |
| "loss": 98.3095, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.11088135694806557, |
| "grad_norm": 105.5, |
| "learning_rate": 9.88011189556414e-07, |
| "loss": 99.468, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.11148071563427132, |
| "grad_norm": 100.5625, |
| "learning_rate": 9.873451445317704e-07, |
| "loss": 99.1593, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.11208007432047709, |
| "grad_norm": 107.6875, |
| "learning_rate": 9.866790995071268e-07, |
| "loss": 100.0329, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.11267943300668284, |
| "grad_norm": 106.125, |
| "learning_rate": 9.86013054482483e-07, |
| "loss": 98.3336, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.11327879169288861, |
| "grad_norm": 106.0625, |
| "learning_rate": 9.853470094578394e-07, |
| "loss": 100.0068, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.11387815037909436, |
| "grad_norm": 101.0625, |
| "learning_rate": 9.846809644331956e-07, |
| "loss": 97.691, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.11447750906530013, |
| "grad_norm": 103.5, |
| "learning_rate": 9.84014919408552e-07, |
| "loss": 98.8459, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.11507686775150588, |
| "grad_norm": 106.125, |
| "learning_rate": 9.833488743839084e-07, |
| "loss": 99.4046, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.11567622643771165, |
| "grad_norm": 103.4375, |
| "learning_rate": 9.826828293592646e-07, |
| "loss": 97.2283, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.1162755851239174, |
| "grad_norm": 106.3125, |
| "learning_rate": 9.82016784334621e-07, |
| "loss": 99.0077, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.11687494381012317, |
| "grad_norm": 108.375, |
| "learning_rate": 9.813507393099774e-07, |
| "loss": 99.1617, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.11747430249632893, |
| "grad_norm": 107.125, |
| "learning_rate": 9.806846942853336e-07, |
| "loss": 99.5582, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.11807366118253469, |
| "grad_norm": 108.0625, |
| "learning_rate": 9.8001864926069e-07, |
| "loss": 99.7954, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.11867301986874045, |
| "grad_norm": 108.0625, |
| "learning_rate": 9.793526042360462e-07, |
| "loss": 99.1493, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.11927237855494621, |
| "grad_norm": 104.5625, |
| "learning_rate": 9.786865592114026e-07, |
| "loss": 97.6637, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.11987173724115197, |
| "grad_norm": 107.375, |
| "learning_rate": 9.78020514186759e-07, |
| "loss": 97.6329, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.11987173724115197, |
| "eval_loss": 3.089021682739258, |
| "eval_runtime": 401.7854, |
| "eval_samples_per_second": 1119.013, |
| "eval_steps_per_second": 34.971, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.12047109592735773, |
| "grad_norm": 105.375, |
| "learning_rate": 9.773544691621152e-07, |
| "loss": 98.9716, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.12107045461356349, |
| "grad_norm": 106.75, |
| "learning_rate": 9.766884241374716e-07, |
| "loss": 99.2064, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.12166981329976925, |
| "grad_norm": 104.8125, |
| "learning_rate": 9.76022379112828e-07, |
| "loss": 100.1851, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.122269171985975, |
| "grad_norm": 103.5625, |
| "learning_rate": 9.753563340881844e-07, |
| "loss": 97.3658, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.12286853067218077, |
| "grad_norm": 106.75, |
| "learning_rate": 9.746902890635406e-07, |
| "loss": 99.1893, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.12346788935838653, |
| "grad_norm": 107.0625, |
| "learning_rate": 9.74024244038897e-07, |
| "loss": 98.0062, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.1240672480445923, |
| "grad_norm": 105.9375, |
| "learning_rate": 9.733581990142534e-07, |
| "loss": 98.6342, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.12466660673079805, |
| "grad_norm": 105.1875, |
| "learning_rate": 9.726921539896096e-07, |
| "loss": 98.5687, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.1252659654170038, |
| "grad_norm": 100.75, |
| "learning_rate": 9.72026108964966e-07, |
| "loss": 96.8965, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.12586532410320955, |
| "grad_norm": 105.0625, |
| "learning_rate": 9.713600639403224e-07, |
| "loss": 97.2172, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.12646468278941533, |
| "grad_norm": 105.625, |
| "learning_rate": 9.706940189156786e-07, |
| "loss": 98.1943, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.1270640414756211, |
| "grad_norm": 106.5, |
| "learning_rate": 9.70027973891035e-07, |
| "loss": 97.6631, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.12766340016182684, |
| "grad_norm": 106.5, |
| "learning_rate": 9.693619288663914e-07, |
| "loss": 96.5269, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.1282627588480326, |
| "grad_norm": 106.1875, |
| "learning_rate": 9.686958838417476e-07, |
| "loss": 99.6469, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.12886211753423837, |
| "grad_norm": 108.75, |
| "learning_rate": 9.68029838817104e-07, |
| "loss": 97.0663, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.12946147622044413, |
| "grad_norm": 101.4375, |
| "learning_rate": 9.673637937924604e-07, |
| "loss": 97.5067, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.13006083490664988, |
| "grad_norm": 108.375, |
| "learning_rate": 9.666977487678166e-07, |
| "loss": 98.6691, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.13066019359285563, |
| "grad_norm": 108.0, |
| "learning_rate": 9.66031703743173e-07, |
| "loss": 98.2915, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.13125955227906141, |
| "grad_norm": 103.1875, |
| "learning_rate": 9.653656587185294e-07, |
| "loss": 98.3848, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.13185891096526717, |
| "grad_norm": 104.1875, |
| "learning_rate": 9.646996136938856e-07, |
| "loss": 98.996, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.13245826965147292, |
| "grad_norm": 104.125, |
| "learning_rate": 9.64033568669242e-07, |
| "loss": 99.8995, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.13305762833767867, |
| "grad_norm": 106.1875, |
| "learning_rate": 9.633675236445982e-07, |
| "loss": 98.3694, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.13365698702388445, |
| "grad_norm": 103.125, |
| "learning_rate": 9.627014786199546e-07, |
| "loss": 97.7132, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.1342563457100902, |
| "grad_norm": 104.125, |
| "learning_rate": 9.62035433595311e-07, |
| "loss": 96.71, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.13485570439629596, |
| "grad_norm": 103.625, |
| "learning_rate": 9.613693885706672e-07, |
| "loss": 98.2567, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.13545506308250171, |
| "grad_norm": 103.625, |
| "learning_rate": 9.607033435460236e-07, |
| "loss": 97.9151, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.1360544217687075, |
| "grad_norm": 103.0, |
| "learning_rate": 9.6003729852138e-07, |
| "loss": 97.817, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.13665378045491325, |
| "grad_norm": 103.25, |
| "learning_rate": 9.593712534967362e-07, |
| "loss": 98.3279, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.137253139141119, |
| "grad_norm": 104.75, |
| "learning_rate": 9.587052084720926e-07, |
| "loss": 97.5354, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.13785249782732475, |
| "grad_norm": 103.875, |
| "learning_rate": 9.58039163447449e-07, |
| "loss": 97.3573, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.13845185651353054, |
| "grad_norm": 105.5, |
| "learning_rate": 9.573731184228054e-07, |
| "loss": 97.9208, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.1390512151997363, |
| "grad_norm": 106.5, |
| "learning_rate": 9.567070733981616e-07, |
| "loss": 96.5023, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.13965057388594204, |
| "grad_norm": 106.5625, |
| "learning_rate": 9.56041028373518e-07, |
| "loss": 96.8592, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.1402499325721478, |
| "grad_norm": 106.75, |
| "learning_rate": 9.553749833488744e-07, |
| "loss": 97.1946, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.14084929125835355, |
| "grad_norm": 106.875, |
| "learning_rate": 9.547089383242306e-07, |
| "loss": 97.9764, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.14144864994455933, |
| "grad_norm": 106.0, |
| "learning_rate": 9.54042893299587e-07, |
| "loss": 98.4564, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.14204800863076508, |
| "grad_norm": 105.5625, |
| "learning_rate": 9.533768482749433e-07, |
| "loss": 96.7205, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.14264736731697084, |
| "grad_norm": 105.6875, |
| "learning_rate": 9.527108032502996e-07, |
| "loss": 98.374, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.1432467260031766, |
| "grad_norm": 106.5625, |
| "learning_rate": 9.52044758225656e-07, |
| "loss": 97.8351, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.14384608468938237, |
| "grad_norm": 104.1875, |
| "learning_rate": 9.513787132010123e-07, |
| "loss": 97.8296, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.14444544337558812, |
| "grad_norm": 103.5625, |
| "learning_rate": 9.507126681763686e-07, |
| "loss": 98.0513, |
| "step": 1205 |
| }, |
| { |
| "epoch": 0.14504480206179388, |
| "grad_norm": 107.6875, |
| "learning_rate": 9.50046623151725e-07, |
| "loss": 96.9166, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.14564416074799963, |
| "grad_norm": 105.375, |
| "learning_rate": 9.493805781270814e-07, |
| "loss": 98.0969, |
| "step": 1215 |
| }, |
| { |
| "epoch": 0.1462435194342054, |
| "grad_norm": 105.0, |
| "learning_rate": 9.487145331024376e-07, |
| "loss": 96.9892, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.14684287812041116, |
| "grad_norm": 102.625, |
| "learning_rate": 9.48048488077794e-07, |
| "loss": 97.9786, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.14744223680661692, |
| "grad_norm": 101.625, |
| "learning_rate": 9.473824430531504e-07, |
| "loss": 96.3052, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.14804159549282267, |
| "grad_norm": 107.8125, |
| "learning_rate": 9.467163980285066e-07, |
| "loss": 97.7565, |
| "step": 1235 |
| }, |
| { |
| "epoch": 0.14864095417902845, |
| "grad_norm": 107.5, |
| "learning_rate": 9.46050353003863e-07, |
| "loss": 96.6391, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.1492403128652342, |
| "grad_norm": 103.375, |
| "learning_rate": 9.453843079792193e-07, |
| "loss": 95.5644, |
| "step": 1245 |
| }, |
| { |
| "epoch": 0.14983967155143996, |
| "grad_norm": 106.125, |
| "learning_rate": 9.447182629545756e-07, |
| "loss": 95.803, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.1504390302376457, |
| "grad_norm": 105.875, |
| "learning_rate": 9.44052217929932e-07, |
| "loss": 96.3256, |
| "step": 1255 |
| }, |
| { |
| "epoch": 0.1510383889238515, |
| "grad_norm": 103.25, |
| "learning_rate": 9.433861729052883e-07, |
| "loss": 98.1857, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.15163774761005724, |
| "grad_norm": 104.5625, |
| "learning_rate": 9.427201278806447e-07, |
| "loss": 97.9819, |
| "step": 1265 |
| }, |
| { |
| "epoch": 0.152237106296263, |
| "grad_norm": 102.6875, |
| "learning_rate": 9.42054082856001e-07, |
| "loss": 96.2453, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.15283646498246875, |
| "grad_norm": 102.75, |
| "learning_rate": 9.413880378313573e-07, |
| "loss": 96.0352, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.15343582366867453, |
| "grad_norm": 104.125, |
| "learning_rate": 9.407219928067138e-07, |
| "loss": 97.2261, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.15403518235488028, |
| "grad_norm": 101.5625, |
| "learning_rate": 9.400559477820699e-07, |
| "loss": 95.9944, |
| "step": 1285 |
| }, |
| { |
| "epoch": 0.15463454104108604, |
| "grad_norm": 105.5, |
| "learning_rate": 9.393899027574263e-07, |
| "loss": 96.8925, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.1552338997272918, |
| "grad_norm": 102.0, |
| "learning_rate": 9.387238577327828e-07, |
| "loss": 97.0403, |
| "step": 1295 |
| }, |
| { |
| "epoch": 0.15583325841349754, |
| "grad_norm": 102.625, |
| "learning_rate": 9.38057812708139e-07, |
| "loss": 95.9254, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.15643261709970332, |
| "grad_norm": 106.0625, |
| "learning_rate": 9.373917676834954e-07, |
| "loss": 97.3174, |
| "step": 1305 |
| }, |
| { |
| "epoch": 0.15703197578590908, |
| "grad_norm": 103.5, |
| "learning_rate": 9.367257226588518e-07, |
| "loss": 95.8913, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.15763133447211483, |
| "grad_norm": 104.375, |
| "learning_rate": 9.360596776342081e-07, |
| "loss": 96.4431, |
| "step": 1315 |
| }, |
| { |
| "epoch": 0.15823069315832058, |
| "grad_norm": 105.1875, |
| "learning_rate": 9.353936326095644e-07, |
| "loss": 97.7829, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.15883005184452637, |
| "grad_norm": 103.9375, |
| "learning_rate": 9.347275875849207e-07, |
| "loss": 97.3946, |
| "step": 1325 |
| }, |
| { |
| "epoch": 0.15942941053073212, |
| "grad_norm": 103.5625, |
| "learning_rate": 9.340615425602771e-07, |
| "loss": 96.9318, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.16002876921693787, |
| "grad_norm": 103.6875, |
| "learning_rate": 9.333954975356334e-07, |
| "loss": 97.0651, |
| "step": 1335 |
| }, |
| { |
| "epoch": 0.16062812790314362, |
| "grad_norm": 103.9375, |
| "learning_rate": 9.327294525109897e-07, |
| "loss": 96.876, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.1612274865893494, |
| "grad_norm": 103.8125, |
| "learning_rate": 9.320634074863461e-07, |
| "loss": 96.045, |
| "step": 1345 |
| }, |
| { |
| "epoch": 0.16182684527555516, |
| "grad_norm": 106.375, |
| "learning_rate": 9.313973624617025e-07, |
| "loss": 96.8118, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.1624262039617609, |
| "grad_norm": 104.9375, |
| "learning_rate": 9.307313174370587e-07, |
| "loss": 96.7171, |
| "step": 1355 |
| }, |
| { |
| "epoch": 0.16302556264796667, |
| "grad_norm": 104.125, |
| "learning_rate": 9.300652724124151e-07, |
| "loss": 94.9321, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.16362492133417245, |
| "grad_norm": 103.8125, |
| "learning_rate": 9.293992273877714e-07, |
| "loss": 95.2336, |
| "step": 1365 |
| }, |
| { |
| "epoch": 0.1642242800203782, |
| "grad_norm": 106.9375, |
| "learning_rate": 9.287331823631277e-07, |
| "loss": 96.755, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.16482363870658395, |
| "grad_norm": 103.5, |
| "learning_rate": 9.280671373384841e-07, |
| "loss": 95.191, |
| "step": 1375 |
| }, |
| { |
| "epoch": 0.1654229973927897, |
| "grad_norm": 103.5625, |
| "learning_rate": 9.274010923138404e-07, |
| "loss": 96.1609, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.1660223560789955, |
| "grad_norm": 105.125, |
| "learning_rate": 9.267350472891967e-07, |
| "loss": 95.8075, |
| "step": 1385 |
| }, |
| { |
| "epoch": 0.16662171476520124, |
| "grad_norm": 101.75, |
| "learning_rate": 9.260690022645531e-07, |
| "loss": 96.2969, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.167221073451407, |
| "grad_norm": 100.75, |
| "learning_rate": 9.254029572399094e-07, |
| "loss": 96.6542, |
| "step": 1395 |
| }, |
| { |
| "epoch": 0.16782043213761275, |
| "grad_norm": 103.3125, |
| "learning_rate": 9.247369122152658e-07, |
| "loss": 96.97, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.16841979082381853, |
| "grad_norm": 101.6875, |
| "learning_rate": 9.24070867190622e-07, |
| "loss": 97.02, |
| "step": 1405 |
| }, |
| { |
| "epoch": 0.16901914951002428, |
| "grad_norm": 105.0, |
| "learning_rate": 9.234048221659784e-07, |
| "loss": 94.5289, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.16961850819623003, |
| "grad_norm": 106.5, |
| "learning_rate": 9.227387771413348e-07, |
| "loss": 95.6256, |
| "step": 1415 |
| }, |
| { |
| "epoch": 0.1702178668824358, |
| "grad_norm": 102.875, |
| "learning_rate": 9.22072732116691e-07, |
| "loss": 95.2031, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.17081722556864154, |
| "grad_norm": 98.4375, |
| "learning_rate": 9.214066870920474e-07, |
| "loss": 96.182, |
| "step": 1425 |
| }, |
| { |
| "epoch": 0.17141658425484732, |
| "grad_norm": 103.0, |
| "learning_rate": 9.207406420674038e-07, |
| "loss": 96.1099, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.17201594294105307, |
| "grad_norm": 104.75, |
| "learning_rate": 9.2007459704276e-07, |
| "loss": 96.3921, |
| "step": 1435 |
| }, |
| { |
| "epoch": 0.17261530162725883, |
| "grad_norm": 106.8125, |
| "learning_rate": 9.194085520181164e-07, |
| "loss": 95.0675, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.17321466031346458, |
| "grad_norm": 98.625, |
| "learning_rate": 9.187425069934727e-07, |
| "loss": 96.4779, |
| "step": 1445 |
| }, |
| { |
| "epoch": 0.17381401899967036, |
| "grad_norm": 104.0, |
| "learning_rate": 9.180764619688291e-07, |
| "loss": 96.3266, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.17441337768587611, |
| "grad_norm": 101.6875, |
| "learning_rate": 9.174104169441854e-07, |
| "loss": 95.1507, |
| "step": 1455 |
| }, |
| { |
| "epoch": 0.17501273637208187, |
| "grad_norm": 105.4375, |
| "learning_rate": 9.167443719195417e-07, |
| "loss": 96.3075, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.17561209505828762, |
| "grad_norm": 104.1875, |
| "learning_rate": 9.160783268948981e-07, |
| "loss": 95.6688, |
| "step": 1465 |
| }, |
| { |
| "epoch": 0.1762114537444934, |
| "grad_norm": 106.875, |
| "learning_rate": 9.154122818702544e-07, |
| "loss": 96.2328, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.17681081243069915, |
| "grad_norm": 102.9375, |
| "learning_rate": 9.147462368456107e-07, |
| "loss": 95.5972, |
| "step": 1475 |
| }, |
| { |
| "epoch": 0.1774101711169049, |
| "grad_norm": 102.4375, |
| "learning_rate": 9.140801918209671e-07, |
| "loss": 96.3281, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.17800952980311066, |
| "grad_norm": 106.3125, |
| "learning_rate": 9.134141467963233e-07, |
| "loss": 95.6763, |
| "step": 1485 |
| }, |
| { |
| "epoch": 0.17860888848931644, |
| "grad_norm": 106.25, |
| "learning_rate": 9.127481017716797e-07, |
| "loss": 96.2383, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.1792082471755222, |
| "grad_norm": 104.625, |
| "learning_rate": 9.120820567470361e-07, |
| "loss": 95.6165, |
| "step": 1495 |
| }, |
| { |
| "epoch": 0.17980760586172795, |
| "grad_norm": 108.25, |
| "learning_rate": 9.114160117223924e-07, |
| "loss": 94.2036, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.17980760586172795, |
| "eval_loss": 2.9848318099975586, |
| "eval_runtime": 403.3245, |
| "eval_samples_per_second": 1114.743, |
| "eval_steps_per_second": 34.838, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.1804069645479337, |
| "grad_norm": 106.5, |
| "learning_rate": 9.107499666977487e-07, |
| "loss": 95.92, |
| "step": 1505 |
| }, |
| { |
| "epoch": 0.18100632323413948, |
| "grad_norm": 105.5, |
| "learning_rate": 9.100839216731051e-07, |
| "loss": 96.1439, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.18160568192034524, |
| "grad_norm": 104.0, |
| "learning_rate": 9.094178766484614e-07, |
| "loss": 94.9684, |
| "step": 1515 |
| }, |
| { |
| "epoch": 0.182205040606551, |
| "grad_norm": 103.8125, |
| "learning_rate": 9.087518316238177e-07, |
| "loss": 95.09, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.18280439929275674, |
| "grad_norm": 104.6875, |
| "learning_rate": 9.08085786599174e-07, |
| "loss": 93.9746, |
| "step": 1525 |
| }, |
| { |
| "epoch": 0.18340375797896252, |
| "grad_norm": 105.5625, |
| "learning_rate": 9.074197415745304e-07, |
| "loss": 93.8851, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.18400311666516828, |
| "grad_norm": 104.125, |
| "learning_rate": 9.067536965498868e-07, |
| "loss": 95.6037, |
| "step": 1535 |
| }, |
| { |
| "epoch": 0.18460247535137403, |
| "grad_norm": 104.75, |
| "learning_rate": 9.06087651525243e-07, |
| "loss": 94.7618, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.18520183403757978, |
| "grad_norm": 103.0, |
| "learning_rate": 9.054216065005994e-07, |
| "loss": 94.8233, |
| "step": 1545 |
| }, |
| { |
| "epoch": 0.18580119272378556, |
| "grad_norm": 105.375, |
| "learning_rate": 9.047555614759558e-07, |
| "loss": 95.4594, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.18640055140999132, |
| "grad_norm": 105.75, |
| "learning_rate": 9.04089516451312e-07, |
| "loss": 94.7827, |
| "step": 1555 |
| }, |
| { |
| "epoch": 0.18699991009619707, |
| "grad_norm": 108.875, |
| "learning_rate": 9.034234714266684e-07, |
| "loss": 95.8091, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.18759926878240282, |
| "grad_norm": 105.9375, |
| "learning_rate": 9.027574264020248e-07, |
| "loss": 94.6553, |
| "step": 1565 |
| }, |
| { |
| "epoch": 0.18819862746860858, |
| "grad_norm": 103.5625, |
| "learning_rate": 9.02091381377381e-07, |
| "loss": 93.9361, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.18879798615481436, |
| "grad_norm": 103.5625, |
| "learning_rate": 9.014253363527374e-07, |
| "loss": 95.9433, |
| "step": 1575 |
| }, |
| { |
| "epoch": 0.1893973448410201, |
| "grad_norm": 103.875, |
| "learning_rate": 9.007592913280937e-07, |
| "loss": 95.7227, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.18999670352722586, |
| "grad_norm": 106.375, |
| "learning_rate": 9.000932463034501e-07, |
| "loss": 94.9062, |
| "step": 1585 |
| }, |
| { |
| "epoch": 0.19059606221343162, |
| "grad_norm": 105.1875, |
| "learning_rate": 8.994272012788064e-07, |
| "loss": 96.6315, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.1911954208996374, |
| "grad_norm": 102.4375, |
| "learning_rate": 8.987611562541627e-07, |
| "loss": 96.9965, |
| "step": 1595 |
| }, |
| { |
| "epoch": 0.19179477958584315, |
| "grad_norm": 101.8125, |
| "learning_rate": 8.980951112295191e-07, |
| "loss": 95.0741, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.1923941382720489, |
| "grad_norm": 102.3125, |
| "learning_rate": 8.974290662048754e-07, |
| "loss": 94.5174, |
| "step": 1605 |
| }, |
| { |
| "epoch": 0.19299349695825466, |
| "grad_norm": 103.8125, |
| "learning_rate": 8.967630211802317e-07, |
| "loss": 94.2278, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.19359285564446044, |
| "grad_norm": 105.5625, |
| "learning_rate": 8.960969761555881e-07, |
| "loss": 94.2003, |
| "step": 1615 |
| }, |
| { |
| "epoch": 0.1941922143306662, |
| "grad_norm": 105.5625, |
| "learning_rate": 8.954309311309444e-07, |
| "loss": 94.3427, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.19479157301687194, |
| "grad_norm": 107.75, |
| "learning_rate": 8.947648861063007e-07, |
| "loss": 94.4432, |
| "step": 1625 |
| }, |
| { |
| "epoch": 0.1953909317030777, |
| "grad_norm": 106.5, |
| "learning_rate": 8.940988410816571e-07, |
| "loss": 95.272, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.19599029038928348, |
| "grad_norm": 102.375, |
| "learning_rate": 8.934327960570134e-07, |
| "loss": 94.1873, |
| "step": 1635 |
| }, |
| { |
| "epoch": 0.19658964907548923, |
| "grad_norm": 103.375, |
| "learning_rate": 8.927667510323697e-07, |
| "loss": 94.2735, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.19718900776169498, |
| "grad_norm": 102.6875, |
| "learning_rate": 8.921007060077261e-07, |
| "loss": 94.2252, |
| "step": 1645 |
| }, |
| { |
| "epoch": 0.19778836644790074, |
| "grad_norm": 106.75, |
| "learning_rate": 8.914346609830824e-07, |
| "loss": 94.3895, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.19838772513410652, |
| "grad_norm": 103.1875, |
| "learning_rate": 8.907686159584387e-07, |
| "loss": 95.1937, |
| "step": 1655 |
| }, |
| { |
| "epoch": 0.19898708382031227, |
| "grad_norm": 102.4375, |
| "learning_rate": 8.90102570933795e-07, |
| "loss": 94.2603, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.19958644250651802, |
| "grad_norm": 102.8125, |
| "learning_rate": 8.894365259091514e-07, |
| "loss": 94.7452, |
| "step": 1665 |
| }, |
| { |
| "epoch": 0.20018580119272378, |
| "grad_norm": 101.5, |
| "learning_rate": 8.887704808845078e-07, |
| "loss": 93.8857, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.20078515987892956, |
| "grad_norm": 102.9375, |
| "learning_rate": 8.88104435859864e-07, |
| "loss": 94.3981, |
| "step": 1675 |
| }, |
| { |
| "epoch": 0.2013845185651353, |
| "grad_norm": 105.375, |
| "learning_rate": 8.874383908352204e-07, |
| "loss": 94.9569, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.20198387725134107, |
| "grad_norm": 106.3125, |
| "learning_rate": 8.867723458105768e-07, |
| "loss": 93.7188, |
| "step": 1685 |
| }, |
| { |
| "epoch": 0.20258323593754682, |
| "grad_norm": 102.375, |
| "learning_rate": 8.86106300785933e-07, |
| "loss": 95.8018, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.20318259462375257, |
| "grad_norm": 103.875, |
| "learning_rate": 8.854402557612894e-07, |
| "loss": 94.5524, |
| "step": 1695 |
| }, |
| { |
| "epoch": 0.20378195330995835, |
| "grad_norm": 105.0, |
| "learning_rate": 8.847742107366457e-07, |
| "loss": 94.7519, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.2043813119961641, |
| "grad_norm": 103.9375, |
| "learning_rate": 8.84108165712002e-07, |
| "loss": 94.7115, |
| "step": 1705 |
| }, |
| { |
| "epoch": 0.20498067068236986, |
| "grad_norm": 100.4375, |
| "learning_rate": 8.834421206873584e-07, |
| "loss": 95.3129, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.2055800293685756, |
| "grad_norm": 103.25, |
| "learning_rate": 8.827760756627147e-07, |
| "loss": 94.4636, |
| "step": 1715 |
| }, |
| { |
| "epoch": 0.2061793880547814, |
| "grad_norm": 106.3125, |
| "learning_rate": 8.821100306380711e-07, |
| "loss": 94.4248, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.20677874674098715, |
| "grad_norm": 103.1875, |
| "learning_rate": 8.814439856134274e-07, |
| "loss": 93.4922, |
| "step": 1725 |
| }, |
| { |
| "epoch": 0.2073781054271929, |
| "grad_norm": 103.75, |
| "learning_rate": 8.807779405887837e-07, |
| "loss": 94.1299, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.20797746411339865, |
| "grad_norm": 105.75, |
| "learning_rate": 8.801118955641401e-07, |
| "loss": 92.188, |
| "step": 1735 |
| }, |
| { |
| "epoch": 0.20857682279960443, |
| "grad_norm": 105.9375, |
| "learning_rate": 8.794458505394963e-07, |
| "loss": 93.6489, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.2091761814858102, |
| "grad_norm": 104.875, |
| "learning_rate": 8.787798055148527e-07, |
| "loss": 94.0485, |
| "step": 1745 |
| }, |
| { |
| "epoch": 0.20977554017201594, |
| "grad_norm": 105.75, |
| "learning_rate": 8.781137604902091e-07, |
| "loss": 94.2215, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.2103748988582217, |
| "grad_norm": 103.4375, |
| "learning_rate": 8.774477154655654e-07, |
| "loss": 93.8523, |
| "step": 1755 |
| }, |
| { |
| "epoch": 0.21097425754442747, |
| "grad_norm": 99.9375, |
| "learning_rate": 8.767816704409217e-07, |
| "loss": 92.7818, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.21157361623063323, |
| "grad_norm": 104.375, |
| "learning_rate": 8.761156254162782e-07, |
| "loss": 93.718, |
| "step": 1765 |
| }, |
| { |
| "epoch": 0.21217297491683898, |
| "grad_norm": 103.375, |
| "learning_rate": 8.754495803916345e-07, |
| "loss": 93.3778, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.21277233360304473, |
| "grad_norm": 103.125, |
| "learning_rate": 8.747835353669907e-07, |
| "loss": 93.6893, |
| "step": 1775 |
| }, |
| { |
| "epoch": 0.21337169228925051, |
| "grad_norm": 105.0, |
| "learning_rate": 8.74117490342347e-07, |
| "loss": 93.1915, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.21397105097545627, |
| "grad_norm": 110.9375, |
| "learning_rate": 8.734514453177035e-07, |
| "loss": 93.6866, |
| "step": 1785 |
| }, |
| { |
| "epoch": 0.21457040966166202, |
| "grad_norm": 101.625, |
| "learning_rate": 8.727854002930598e-07, |
| "loss": 93.3116, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.21516976834786777, |
| "grad_norm": 105.4375, |
| "learning_rate": 8.72119355268416e-07, |
| "loss": 94.4551, |
| "step": 1795 |
| }, |
| { |
| "epoch": 0.21576912703407355, |
| "grad_norm": 101.75, |
| "learning_rate": 8.714533102437725e-07, |
| "loss": 93.7837, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.2163684857202793, |
| "grad_norm": 101.9375, |
| "learning_rate": 8.707872652191289e-07, |
| "loss": 94.558, |
| "step": 1805 |
| }, |
| { |
| "epoch": 0.21696784440648506, |
| "grad_norm": 103.6875, |
| "learning_rate": 8.701212201944851e-07, |
| "loss": 94.2175, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.21756720309269081, |
| "grad_norm": 106.125, |
| "learning_rate": 8.694551751698415e-07, |
| "loss": 94.034, |
| "step": 1815 |
| }, |
| { |
| "epoch": 0.21816656177889657, |
| "grad_norm": 104.0625, |
| "learning_rate": 8.687891301451978e-07, |
| "loss": 92.8112, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.21876592046510235, |
| "grad_norm": 102.75, |
| "learning_rate": 8.681230851205541e-07, |
| "loss": 92.7093, |
| "step": 1825 |
| }, |
| { |
| "epoch": 0.2193652791513081, |
| "grad_norm": 101.4375, |
| "learning_rate": 8.674570400959105e-07, |
| "loss": 93.7157, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.21996463783751385, |
| "grad_norm": 104.0, |
| "learning_rate": 8.667909950712668e-07, |
| "loss": 92.883, |
| "step": 1835 |
| }, |
| { |
| "epoch": 0.2205639965237196, |
| "grad_norm": 107.6875, |
| "learning_rate": 8.661249500466232e-07, |
| "loss": 93.1181, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.2211633552099254, |
| "grad_norm": 102.0625, |
| "learning_rate": 8.654589050219795e-07, |
| "loss": 92.2143, |
| "step": 1845 |
| }, |
| { |
| "epoch": 0.22176271389613114, |
| "grad_norm": 107.625, |
| "learning_rate": 8.647928599973358e-07, |
| "loss": 93.7465, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.2223620725823369, |
| "grad_norm": 102.375, |
| "learning_rate": 8.641268149726922e-07, |
| "loss": 92.4702, |
| "step": 1855 |
| }, |
| { |
| "epoch": 0.22296143126854265, |
| "grad_norm": 104.0625, |
| "learning_rate": 8.634607699480484e-07, |
| "loss": 94.137, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.22356078995474843, |
| "grad_norm": 102.8125, |
| "learning_rate": 8.627947249234048e-07, |
| "loss": 92.3921, |
| "step": 1865 |
| }, |
| { |
| "epoch": 0.22416014864095418, |
| "grad_norm": 101.875, |
| "learning_rate": 8.621286798987612e-07, |
| "loss": 93.8057, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.22475950732715994, |
| "grad_norm": 104.9375, |
| "learning_rate": 8.614626348741174e-07, |
| "loss": 92.852, |
| "step": 1875 |
| }, |
| { |
| "epoch": 0.2253588660133657, |
| "grad_norm": 103.25, |
| "learning_rate": 8.607965898494738e-07, |
| "loss": 92.9432, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.22595822469957147, |
| "grad_norm": 103.0625, |
| "learning_rate": 8.601305448248302e-07, |
| "loss": 91.3228, |
| "step": 1885 |
| }, |
| { |
| "epoch": 0.22655758338577722, |
| "grad_norm": 103.8125, |
| "learning_rate": 8.594644998001865e-07, |
| "loss": 92.478, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.22715694207198298, |
| "grad_norm": 104.5625, |
| "learning_rate": 8.587984547755428e-07, |
| "loss": 93.0927, |
| "step": 1895 |
| }, |
| { |
| "epoch": 0.22775630075818873, |
| "grad_norm": 102.25, |
| "learning_rate": 8.581324097508991e-07, |
| "loss": 94.3626, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.2283556594443945, |
| "grad_norm": 104.625, |
| "learning_rate": 8.574663647262555e-07, |
| "loss": 92.3862, |
| "step": 1905 |
| }, |
| { |
| "epoch": 0.22895501813060026, |
| "grad_norm": 102.8125, |
| "learning_rate": 8.568003197016118e-07, |
| "loss": 92.2181, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.22955437681680602, |
| "grad_norm": 107.5, |
| "learning_rate": 8.561342746769681e-07, |
| "loss": 94.641, |
| "step": 1915 |
| }, |
| { |
| "epoch": 0.23015373550301177, |
| "grad_norm": 100.625, |
| "learning_rate": 8.554682296523245e-07, |
| "loss": 92.6945, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.23075309418921755, |
| "grad_norm": 103.6875, |
| "learning_rate": 8.548021846276808e-07, |
| "loss": 94.1129, |
| "step": 1925 |
| }, |
| { |
| "epoch": 0.2313524528754233, |
| "grad_norm": 104.0, |
| "learning_rate": 8.541361396030371e-07, |
| "loss": 92.2068, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.23195181156162906, |
| "grad_norm": 104.875, |
| "learning_rate": 8.534700945783935e-07, |
| "loss": 93.1549, |
| "step": 1935 |
| }, |
| { |
| "epoch": 0.2325511702478348, |
| "grad_norm": 104.625, |
| "learning_rate": 8.528040495537499e-07, |
| "loss": 90.6698, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.23315052893404056, |
| "grad_norm": 103.25, |
| "learning_rate": 8.521380045291061e-07, |
| "loss": 93.3605, |
| "step": 1945 |
| }, |
| { |
| "epoch": 0.23374988762024634, |
| "grad_norm": 106.0625, |
| "learning_rate": 8.514719595044625e-07, |
| "loss": 93.7277, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.2343492463064521, |
| "grad_norm": 100.25, |
| "learning_rate": 8.508059144798188e-07, |
| "loss": 92.605, |
| "step": 1955 |
| }, |
| { |
| "epoch": 0.23494860499265785, |
| "grad_norm": 102.3125, |
| "learning_rate": 8.501398694551751e-07, |
| "loss": 93.0283, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.2355479636788636, |
| "grad_norm": 107.5625, |
| "learning_rate": 8.494738244305315e-07, |
| "loss": 91.955, |
| "step": 1965 |
| }, |
| { |
| "epoch": 0.23614732236506938, |
| "grad_norm": 99.375, |
| "learning_rate": 8.488077794058878e-07, |
| "loss": 92.1366, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.23674668105127514, |
| "grad_norm": 104.8125, |
| "learning_rate": 8.481417343812442e-07, |
| "loss": 91.6786, |
| "step": 1975 |
| }, |
| { |
| "epoch": 0.2373460397374809, |
| "grad_norm": 103.875, |
| "learning_rate": 8.474756893566005e-07, |
| "loss": 91.9743, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.23794539842368664, |
| "grad_norm": 103.25, |
| "learning_rate": 8.468096443319568e-07, |
| "loss": 91.3638, |
| "step": 1985 |
| }, |
| { |
| "epoch": 0.23854475710989242, |
| "grad_norm": 103.3125, |
| "learning_rate": 8.461435993073132e-07, |
| "loss": 92.2037, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.23914411579609818, |
| "grad_norm": 109.25, |
| "learning_rate": 8.454775542826694e-07, |
| "loss": 92.2242, |
| "step": 1995 |
| }, |
| { |
| "epoch": 0.23974347448230393, |
| "grad_norm": 103.4375, |
| "learning_rate": 8.448115092580258e-07, |
| "loss": 93.88, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.23974347448230393, |
| "eval_loss": 2.8944661617279053, |
| "eval_runtime": 404.8634, |
| "eval_samples_per_second": 1110.505, |
| "eval_steps_per_second": 34.706, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.24034283316850968, |
| "grad_norm": 101.5625, |
| "learning_rate": 8.441454642333822e-07, |
| "loss": 93.8037, |
| "step": 2005 |
| }, |
| { |
| "epoch": 0.24094219185471547, |
| "grad_norm": 103.5, |
| "learning_rate": 8.434794192087384e-07, |
| "loss": 92.7435, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.24154155054092122, |
| "grad_norm": 100.4375, |
| "learning_rate": 8.428133741840948e-07, |
| "loss": 93.3825, |
| "step": 2015 |
| }, |
| { |
| "epoch": 0.24214090922712697, |
| "grad_norm": 103.0, |
| "learning_rate": 8.421473291594512e-07, |
| "loss": 91.97, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.24274026791333272, |
| "grad_norm": 105.75, |
| "learning_rate": 8.414812841348075e-07, |
| "loss": 92.3056, |
| "step": 2025 |
| }, |
| { |
| "epoch": 0.2433396265995385, |
| "grad_norm": 101.5625, |
| "learning_rate": 8.408152391101638e-07, |
| "loss": 93.6722, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.24393898528574426, |
| "grad_norm": 104.3125, |
| "learning_rate": 8.401491940855201e-07, |
| "loss": 92.8126, |
| "step": 2035 |
| }, |
| { |
| "epoch": 0.24453834397195, |
| "grad_norm": 106.25, |
| "learning_rate": 8.394831490608765e-07, |
| "loss": 92.2455, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.24513770265815576, |
| "grad_norm": 103.8125, |
| "learning_rate": 8.388171040362328e-07, |
| "loss": 93.0326, |
| "step": 2045 |
| }, |
| { |
| "epoch": 0.24573706134436155, |
| "grad_norm": 102.4375, |
| "learning_rate": 8.381510590115891e-07, |
| "loss": 92.0434, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.2463364200305673, |
| "grad_norm": 103.9375, |
| "learning_rate": 8.374850139869455e-07, |
| "loss": 92.13, |
| "step": 2055 |
| }, |
| { |
| "epoch": 0.24693577871677305, |
| "grad_norm": 103.4375, |
| "learning_rate": 8.368189689623019e-07, |
| "loss": 92.0325, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.2475351374029788, |
| "grad_norm": 105.875, |
| "learning_rate": 8.361529239376581e-07, |
| "loss": 92.9597, |
| "step": 2065 |
| }, |
| { |
| "epoch": 0.2481344960891846, |
| "grad_norm": 102.25, |
| "learning_rate": 8.354868789130145e-07, |
| "loss": 91.3815, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.24873385477539034, |
| "grad_norm": 107.0, |
| "learning_rate": 8.348208338883708e-07, |
| "loss": 92.8083, |
| "step": 2075 |
| }, |
| { |
| "epoch": 0.2493332134615961, |
| "grad_norm": 106.0, |
| "learning_rate": 8.341547888637271e-07, |
| "loss": 90.4569, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.24993257214780185, |
| "grad_norm": 104.375, |
| "learning_rate": 8.334887438390835e-07, |
| "loss": 92.4561, |
| "step": 2085 |
| }, |
| { |
| "epoch": 0.2505319308340076, |
| "grad_norm": 105.125, |
| "learning_rate": 8.328226988144398e-07, |
| "loss": 93.1541, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.25113128952021335, |
| "grad_norm": 103.5625, |
| "learning_rate": 8.321566537897961e-07, |
| "loss": 92.0643, |
| "step": 2095 |
| }, |
| { |
| "epoch": 0.2517306482064191, |
| "grad_norm": 103.625, |
| "learning_rate": 8.314906087651525e-07, |
| "loss": 92.059, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.2523300068926249, |
| "grad_norm": 104.0, |
| "learning_rate": 8.308245637405088e-07, |
| "loss": 92.2131, |
| "step": 2105 |
| }, |
| { |
| "epoch": 0.25292936557883067, |
| "grad_norm": 101.0, |
| "learning_rate": 8.301585187158652e-07, |
| "loss": 92.5779, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.2535287242650364, |
| "grad_norm": 105.625, |
| "learning_rate": 8.294924736912214e-07, |
| "loss": 93.5033, |
| "step": 2115 |
| }, |
| { |
| "epoch": 0.2541280829512422, |
| "grad_norm": 105.0, |
| "learning_rate": 8.288264286665778e-07, |
| "loss": 92.5464, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.2547274416374479, |
| "grad_norm": 105.625, |
| "learning_rate": 8.281603836419342e-07, |
| "loss": 91.3447, |
| "step": 2125 |
| }, |
| { |
| "epoch": 0.2553268003236537, |
| "grad_norm": 105.5, |
| "learning_rate": 8.274943386172904e-07, |
| "loss": 91.7229, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.25592615900985943, |
| "grad_norm": 106.125, |
| "learning_rate": 8.268282935926468e-07, |
| "loss": 90.8414, |
| "step": 2135 |
| }, |
| { |
| "epoch": 0.2565255176960652, |
| "grad_norm": 103.875, |
| "learning_rate": 8.261622485680032e-07, |
| "loss": 91.5145, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.257124876382271, |
| "grad_norm": 101.5, |
| "learning_rate": 8.254962035433594e-07, |
| "loss": 91.8491, |
| "step": 2145 |
| }, |
| { |
| "epoch": 0.25772423506847675, |
| "grad_norm": 105.0, |
| "learning_rate": 8.248301585187158e-07, |
| "loss": 92.1283, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.2583235937546825, |
| "grad_norm": 105.1875, |
| "learning_rate": 8.241641134940721e-07, |
| "loss": 91.1556, |
| "step": 2155 |
| }, |
| { |
| "epoch": 0.25892295244088825, |
| "grad_norm": 102.25, |
| "learning_rate": 8.234980684694285e-07, |
| "loss": 93.0634, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.259522311127094, |
| "grad_norm": 101.25, |
| "learning_rate": 8.228320234447848e-07, |
| "loss": 91.6604, |
| "step": 2165 |
| }, |
| { |
| "epoch": 0.26012166981329976, |
| "grad_norm": 108.1875, |
| "learning_rate": 8.221659784201411e-07, |
| "loss": 90.7036, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.2607210284995055, |
| "grad_norm": 101.0, |
| "learning_rate": 8.214999333954975e-07, |
| "loss": 91.9036, |
| "step": 2175 |
| }, |
| { |
| "epoch": 0.26132038718571127, |
| "grad_norm": 103.9375, |
| "learning_rate": 8.208338883708538e-07, |
| "loss": 92.7012, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.2619197458719171, |
| "grad_norm": 107.0, |
| "learning_rate": 8.201678433462101e-07, |
| "loss": 91.6033, |
| "step": 2185 |
| }, |
| { |
| "epoch": 0.26251910455812283, |
| "grad_norm": 99.25, |
| "learning_rate": 8.195017983215665e-07, |
| "loss": 91.3827, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.2631184632443286, |
| "grad_norm": 103.375, |
| "learning_rate": 8.188357532969227e-07, |
| "loss": 91.5665, |
| "step": 2195 |
| }, |
| { |
| "epoch": 0.26371782193053434, |
| "grad_norm": 102.4375, |
| "learning_rate": 8.181697082722791e-07, |
| "loss": 91.2502, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.2643171806167401, |
| "grad_norm": 105.25, |
| "learning_rate": 8.175036632476355e-07, |
| "loss": 91.9588, |
| "step": 2205 |
| }, |
| { |
| "epoch": 0.26491653930294584, |
| "grad_norm": 102.9375, |
| "learning_rate": 8.168376182229918e-07, |
| "loss": 92.1473, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.2655158979891516, |
| "grad_norm": 101.6875, |
| "learning_rate": 8.161715731983481e-07, |
| "loss": 91.0215, |
| "step": 2215 |
| }, |
| { |
| "epoch": 0.26611525667535735, |
| "grad_norm": 103.25, |
| "learning_rate": 8.155055281737045e-07, |
| "loss": 91.4128, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.2667146153615631, |
| "grad_norm": 104.0625, |
| "learning_rate": 8.148394831490608e-07, |
| "loss": 91.6982, |
| "step": 2225 |
| }, |
| { |
| "epoch": 0.2673139740477689, |
| "grad_norm": 103.1875, |
| "learning_rate": 8.141734381244171e-07, |
| "loss": 91.3595, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.26791333273397466, |
| "grad_norm": 106.0, |
| "learning_rate": 8.135073930997734e-07, |
| "loss": 91.1334, |
| "step": 2235 |
| }, |
| { |
| "epoch": 0.2685126914201804, |
| "grad_norm": 105.8125, |
| "learning_rate": 8.128413480751298e-07, |
| "loss": 91.4205, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.26911205010638617, |
| "grad_norm": 105.125, |
| "learning_rate": 8.121753030504863e-07, |
| "loss": 91.8378, |
| "step": 2245 |
| }, |
| { |
| "epoch": 0.2697114087925919, |
| "grad_norm": 101.875, |
| "learning_rate": 8.115092580258424e-07, |
| "loss": 90.8697, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.2703107674787977, |
| "grad_norm": 101.75, |
| "learning_rate": 8.108432130011989e-07, |
| "loss": 90.6172, |
| "step": 2255 |
| }, |
| { |
| "epoch": 0.27091012616500343, |
| "grad_norm": 103.6875, |
| "learning_rate": 8.101771679765553e-07, |
| "loss": 90.3885, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.2715094848512092, |
| "grad_norm": 101.6875, |
| "learning_rate": 8.095111229519114e-07, |
| "loss": 91.2055, |
| "step": 2265 |
| }, |
| { |
| "epoch": 0.272108843537415, |
| "grad_norm": 104.3125, |
| "learning_rate": 8.088450779272679e-07, |
| "loss": 90.61, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.27270820222362074, |
| "grad_norm": 103.375, |
| "learning_rate": 8.081790329026243e-07, |
| "loss": 91.2788, |
| "step": 2275 |
| }, |
| { |
| "epoch": 0.2733075609098265, |
| "grad_norm": 104.1875, |
| "learning_rate": 8.075129878779805e-07, |
| "loss": 92.0951, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.27390691959603225, |
| "grad_norm": 104.375, |
| "learning_rate": 8.068469428533369e-07, |
| "loss": 90.7074, |
| "step": 2285 |
| }, |
| { |
| "epoch": 0.274506278282238, |
| "grad_norm": 104.625, |
| "learning_rate": 8.061808978286932e-07, |
| "loss": 89.8515, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.27510563696844376, |
| "grad_norm": 104.25, |
| "learning_rate": 8.055148528040496e-07, |
| "loss": 89.7757, |
| "step": 2295 |
| }, |
| { |
| "epoch": 0.2757049956546495, |
| "grad_norm": 102.8125, |
| "learning_rate": 8.048488077794059e-07, |
| "loss": 90.4891, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.27630435434085526, |
| "grad_norm": 102.0, |
| "learning_rate": 8.041827627547622e-07, |
| "loss": 90.5568, |
| "step": 2305 |
| }, |
| { |
| "epoch": 0.27690371302706107, |
| "grad_norm": 104.375, |
| "learning_rate": 8.035167177301186e-07, |
| "loss": 90.9858, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.2775030717132668, |
| "grad_norm": 106.8125, |
| "learning_rate": 8.028506727054749e-07, |
| "loss": 90.1024, |
| "step": 2315 |
| }, |
| { |
| "epoch": 0.2781024303994726, |
| "grad_norm": 102.25, |
| "learning_rate": 8.021846276808312e-07, |
| "loss": 91.8137, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.27870178908567833, |
| "grad_norm": 102.375, |
| "learning_rate": 8.015185826561876e-07, |
| "loss": 90.8296, |
| "step": 2325 |
| }, |
| { |
| "epoch": 0.2793011477718841, |
| "grad_norm": 99.4375, |
| "learning_rate": 8.008525376315438e-07, |
| "loss": 89.5984, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.27990050645808984, |
| "grad_norm": 104.0625, |
| "learning_rate": 8.001864926069002e-07, |
| "loss": 91.0086, |
| "step": 2335 |
| }, |
| { |
| "epoch": 0.2804998651442956, |
| "grad_norm": 104.6875, |
| "learning_rate": 7.995204475822566e-07, |
| "loss": 90.5621, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.28109922383050134, |
| "grad_norm": 102.5625, |
| "learning_rate": 7.988544025576129e-07, |
| "loss": 90.5947, |
| "step": 2345 |
| }, |
| { |
| "epoch": 0.2816985825167071, |
| "grad_norm": 103.0625, |
| "learning_rate": 7.981883575329692e-07, |
| "loss": 91.9352, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.2822979412029129, |
| "grad_norm": 102.8125, |
| "learning_rate": 7.975223125083256e-07, |
| "loss": 90.7241, |
| "step": 2355 |
| }, |
| { |
| "epoch": 0.28289729988911866, |
| "grad_norm": 103.1875, |
| "learning_rate": 7.968562674836819e-07, |
| "loss": 90.8843, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.2834966585753244, |
| "grad_norm": 105.6875, |
| "learning_rate": 7.961902224590382e-07, |
| "loss": 91.3989, |
| "step": 2365 |
| }, |
| { |
| "epoch": 0.28409601726153016, |
| "grad_norm": 100.6875, |
| "learning_rate": 7.955241774343945e-07, |
| "loss": 89.8323, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.2846953759477359, |
| "grad_norm": 101.5, |
| "learning_rate": 7.948581324097509e-07, |
| "loss": 90.1746, |
| "step": 2375 |
| }, |
| { |
| "epoch": 0.28529473463394167, |
| "grad_norm": 104.0, |
| "learning_rate": 7.941920873851073e-07, |
| "loss": 90.6545, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.2858940933201474, |
| "grad_norm": 104.6875, |
| "learning_rate": 7.935260423604635e-07, |
| "loss": 90.5792, |
| "step": 2385 |
| }, |
| { |
| "epoch": 0.2864934520063532, |
| "grad_norm": 104.625, |
| "learning_rate": 7.928599973358199e-07, |
| "loss": 91.1984, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.287092810692559, |
| "grad_norm": 104.5, |
| "learning_rate": 7.921939523111763e-07, |
| "loss": 90.3363, |
| "step": 2395 |
| }, |
| { |
| "epoch": 0.28769216937876474, |
| "grad_norm": 101.5, |
| "learning_rate": 7.915279072865325e-07, |
| "loss": 90.6117, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.2882915280649705, |
| "grad_norm": 104.875, |
| "learning_rate": 7.908618622618889e-07, |
| "loss": 89.2634, |
| "step": 2405 |
| }, |
| { |
| "epoch": 0.28889088675117625, |
| "grad_norm": 105.625, |
| "learning_rate": 7.901958172372452e-07, |
| "loss": 90.1148, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.289490245437382, |
| "grad_norm": 104.5625, |
| "learning_rate": 7.895297722126015e-07, |
| "loss": 91.1194, |
| "step": 2415 |
| }, |
| { |
| "epoch": 0.29008960412358775, |
| "grad_norm": 105.3125, |
| "learning_rate": 7.888637271879579e-07, |
| "loss": 90.17, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.2906889628097935, |
| "grad_norm": 105.6875, |
| "learning_rate": 7.881976821633142e-07, |
| "loss": 91.4705, |
| "step": 2425 |
| }, |
| { |
| "epoch": 0.29128832149599926, |
| "grad_norm": 101.8125, |
| "learning_rate": 7.875316371386706e-07, |
| "loss": 90.0706, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.29188768018220507, |
| "grad_norm": 102.5, |
| "learning_rate": 7.868655921140269e-07, |
| "loss": 89.9069, |
| "step": 2435 |
| }, |
| { |
| "epoch": 0.2924870388684108, |
| "grad_norm": 103.0625, |
| "learning_rate": 7.861995470893832e-07, |
| "loss": 89.592, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.2930863975546166, |
| "grad_norm": 101.5, |
| "learning_rate": 7.855335020647396e-07, |
| "loss": 90.5722, |
| "step": 2445 |
| }, |
| { |
| "epoch": 0.2936857562408223, |
| "grad_norm": 103.9375, |
| "learning_rate": 7.848674570400958e-07, |
| "loss": 90.2673, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.2942851149270281, |
| "grad_norm": 101.6875, |
| "learning_rate": 7.842014120154522e-07, |
| "loss": 90.2366, |
| "step": 2455 |
| }, |
| { |
| "epoch": 0.29488447361323383, |
| "grad_norm": 102.25, |
| "learning_rate": 7.835353669908086e-07, |
| "loss": 90.1811, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.2954838322994396, |
| "grad_norm": 102.375, |
| "learning_rate": 7.828693219661648e-07, |
| "loss": 91.0067, |
| "step": 2465 |
| }, |
| { |
| "epoch": 0.29608319098564534, |
| "grad_norm": 102.25, |
| "learning_rate": 7.822032769415212e-07, |
| "loss": 89.0682, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.2966825496718511, |
| "grad_norm": 103.25, |
| "learning_rate": 7.815372319168776e-07, |
| "loss": 90.7641, |
| "step": 2475 |
| }, |
| { |
| "epoch": 0.2972819083580569, |
| "grad_norm": 106.375, |
| "learning_rate": 7.808711868922339e-07, |
| "loss": 92.3741, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.29788126704426265, |
| "grad_norm": 108.0, |
| "learning_rate": 7.802051418675902e-07, |
| "loss": 90.1461, |
| "step": 2485 |
| }, |
| { |
| "epoch": 0.2984806257304684, |
| "grad_norm": 105.9375, |
| "learning_rate": 7.795390968429465e-07, |
| "loss": 91.2824, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.29907998441667416, |
| "grad_norm": 101.625, |
| "learning_rate": 7.788730518183029e-07, |
| "loss": 89.7306, |
| "step": 2495 |
| }, |
| { |
| "epoch": 0.2996793431028799, |
| "grad_norm": 103.3125, |
| "learning_rate": 7.782070067936592e-07, |
| "loss": 91.2063, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.2996793431028799, |
| "eval_loss": 2.8105251789093018, |
| "eval_runtime": 402.9918, |
| "eval_samples_per_second": 1115.663, |
| "eval_steps_per_second": 34.867, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.30027870178908567, |
| "grad_norm": 105.9375, |
| "learning_rate": 7.775409617690155e-07, |
| "loss": 89.4656, |
| "step": 2505 |
| }, |
| { |
| "epoch": 0.3008780604752914, |
| "grad_norm": 107.3125, |
| "learning_rate": 7.768749167443719e-07, |
| "loss": 89.8508, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.3014774191614972, |
| "grad_norm": 101.125, |
| "learning_rate": 7.762088717197283e-07, |
| "loss": 89.322, |
| "step": 2515 |
| }, |
| { |
| "epoch": 0.302076777847703, |
| "grad_norm": 105.875, |
| "learning_rate": 7.755428266950845e-07, |
| "loss": 89.2538, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.30267613653390874, |
| "grad_norm": 102.1875, |
| "learning_rate": 7.748767816704409e-07, |
| "loss": 89.4337, |
| "step": 2525 |
| }, |
| { |
| "epoch": 0.3032754952201145, |
| "grad_norm": 102.5, |
| "learning_rate": 7.742107366457972e-07, |
| "loss": 89.8735, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.30387485390632024, |
| "grad_norm": 102.375, |
| "learning_rate": 7.735446916211535e-07, |
| "loss": 90.7425, |
| "step": 2535 |
| }, |
| { |
| "epoch": 0.304474212592526, |
| "grad_norm": 105.75, |
| "learning_rate": 7.728786465965099e-07, |
| "loss": 89.6295, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.30507357127873175, |
| "grad_norm": 100.5625, |
| "learning_rate": 7.722126015718662e-07, |
| "loss": 89.3076, |
| "step": 2545 |
| }, |
| { |
| "epoch": 0.3056729299649375, |
| "grad_norm": 103.8125, |
| "learning_rate": 7.715465565472225e-07, |
| "loss": 89.8791, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.30627228865114325, |
| "grad_norm": 101.1875, |
| "learning_rate": 7.708805115225789e-07, |
| "loss": 89.6218, |
| "step": 2555 |
| }, |
| { |
| "epoch": 0.30687164733734906, |
| "grad_norm": 103.875, |
| "learning_rate": 7.702144664979352e-07, |
| "loss": 90.6524, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.3074710060235548, |
| "grad_norm": 101.0625, |
| "learning_rate": 7.695484214732916e-07, |
| "loss": 89.677, |
| "step": 2565 |
| }, |
| { |
| "epoch": 0.30807036470976057, |
| "grad_norm": 106.75, |
| "learning_rate": 7.688823764486478e-07, |
| "loss": 88.4502, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.3086697233959663, |
| "grad_norm": 105.125, |
| "learning_rate": 7.682163314240042e-07, |
| "loss": 91.3921, |
| "step": 2575 |
| }, |
| { |
| "epoch": 0.3092690820821721, |
| "grad_norm": 99.375, |
| "learning_rate": 7.675502863993606e-07, |
| "loss": 90.8827, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.30986844076837783, |
| "grad_norm": 101.6875, |
| "learning_rate": 7.668842413747168e-07, |
| "loss": 88.3922, |
| "step": 2585 |
| }, |
| { |
| "epoch": 0.3104677994545836, |
| "grad_norm": 102.4375, |
| "learning_rate": 7.662181963500732e-07, |
| "loss": 89.6833, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.31106715814078933, |
| "grad_norm": 103.125, |
| "learning_rate": 7.655521513254296e-07, |
| "loss": 88.9644, |
| "step": 2595 |
| }, |
| { |
| "epoch": 0.3116665168269951, |
| "grad_norm": 102.1875, |
| "learning_rate": 7.648861063007859e-07, |
| "loss": 88.5355, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.3122658755132009, |
| "grad_norm": 105.6875, |
| "learning_rate": 7.642200612761422e-07, |
| "loss": 89.3148, |
| "step": 2605 |
| }, |
| { |
| "epoch": 0.31286523419940665, |
| "grad_norm": 100.9375, |
| "learning_rate": 7.635540162514985e-07, |
| "loss": 89.9472, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.3134645928856124, |
| "grad_norm": 103.0, |
| "learning_rate": 7.628879712268549e-07, |
| "loss": 89.6862, |
| "step": 2615 |
| }, |
| { |
| "epoch": 0.31406395157181816, |
| "grad_norm": 102.6875, |
| "learning_rate": 7.622219262022112e-07, |
| "loss": 88.9315, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.3146633102580239, |
| "grad_norm": 102.4375, |
| "learning_rate": 7.615558811775675e-07, |
| "loss": 89.8684, |
| "step": 2625 |
| }, |
| { |
| "epoch": 0.31526266894422966, |
| "grad_norm": 102.75, |
| "learning_rate": 7.608898361529239e-07, |
| "loss": 89.971, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.3158620276304354, |
| "grad_norm": 104.6875, |
| "learning_rate": 7.602237911282802e-07, |
| "loss": 90.4864, |
| "step": 2635 |
| }, |
| { |
| "epoch": 0.31646138631664117, |
| "grad_norm": 102.25, |
| "learning_rate": 7.595577461036365e-07, |
| "loss": 89.3933, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.317060745002847, |
| "grad_norm": 107.5625, |
| "learning_rate": 7.588917010789929e-07, |
| "loss": 91.2062, |
| "step": 2645 |
| }, |
| { |
| "epoch": 0.31766010368905273, |
| "grad_norm": 104.1875, |
| "learning_rate": 7.582256560543493e-07, |
| "loss": 88.6704, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.3182594623752585, |
| "grad_norm": 100.75, |
| "learning_rate": 7.575596110297055e-07, |
| "loss": 88.1193, |
| "step": 2655 |
| }, |
| { |
| "epoch": 0.31885882106146424, |
| "grad_norm": 103.1875, |
| "learning_rate": 7.568935660050619e-07, |
| "loss": 89.7495, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.31945817974767, |
| "grad_norm": 104.3125, |
| "learning_rate": 7.562275209804182e-07, |
| "loss": 89.872, |
| "step": 2665 |
| }, |
| { |
| "epoch": 0.32005753843387574, |
| "grad_norm": 108.1875, |
| "learning_rate": 7.555614759557745e-07, |
| "loss": 88.5228, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.3206568971200815, |
| "grad_norm": 103.5625, |
| "learning_rate": 7.548954309311309e-07, |
| "loss": 89.145, |
| "step": 2675 |
| }, |
| { |
| "epoch": 0.32125625580628725, |
| "grad_norm": 101.625, |
| "learning_rate": 7.542293859064872e-07, |
| "loss": 87.3988, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.32185561449249306, |
| "grad_norm": 103.25, |
| "learning_rate": 7.535633408818435e-07, |
| "loss": 89.5909, |
| "step": 2685 |
| }, |
| { |
| "epoch": 0.3224549731786988, |
| "grad_norm": 102.5625, |
| "learning_rate": 7.528972958571999e-07, |
| "loss": 88.2594, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.32305433186490456, |
| "grad_norm": 103.125, |
| "learning_rate": 7.522312508325562e-07, |
| "loss": 90.1901, |
| "step": 2695 |
| }, |
| { |
| "epoch": 0.3236536905511103, |
| "grad_norm": 105.0, |
| "learning_rate": 7.515652058079126e-07, |
| "loss": 88.9892, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.32425304923731607, |
| "grad_norm": 106.0, |
| "learning_rate": 7.508991607832688e-07, |
| "loss": 88.7119, |
| "step": 2705 |
| }, |
| { |
| "epoch": 0.3248524079235218, |
| "grad_norm": 106.8125, |
| "learning_rate": 7.502331157586252e-07, |
| "loss": 90.3967, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.3254517666097276, |
| "grad_norm": 102.0, |
| "learning_rate": 7.495670707339817e-07, |
| "loss": 89.0769, |
| "step": 2715 |
| }, |
| { |
| "epoch": 0.32605112529593333, |
| "grad_norm": 101.375, |
| "learning_rate": 7.489010257093378e-07, |
| "loss": 88.1126, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.3266504839821391, |
| "grad_norm": 104.75, |
| "learning_rate": 7.482349806846942e-07, |
| "loss": 88.2526, |
| "step": 2725 |
| }, |
| { |
| "epoch": 0.3272498426683449, |
| "grad_norm": 99.625, |
| "learning_rate": 7.475689356600507e-07, |
| "loss": 88.5508, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.32784920135455065, |
| "grad_norm": 106.875, |
| "learning_rate": 7.46902890635407e-07, |
| "loss": 89.857, |
| "step": 2735 |
| }, |
| { |
| "epoch": 0.3284485600407564, |
| "grad_norm": 102.0625, |
| "learning_rate": 7.462368456107633e-07, |
| "loss": 88.2153, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.32904791872696215, |
| "grad_norm": 104.4375, |
| "learning_rate": 7.455708005861196e-07, |
| "loss": 89.4912, |
| "step": 2745 |
| }, |
| { |
| "epoch": 0.3296472774131679, |
| "grad_norm": 102.625, |
| "learning_rate": 7.44904755561476e-07, |
| "loss": 89.2615, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.33024663609937366, |
| "grad_norm": 104.8125, |
| "learning_rate": 7.442387105368323e-07, |
| "loss": 87.596, |
| "step": 2755 |
| }, |
| { |
| "epoch": 0.3308459947855794, |
| "grad_norm": 101.625, |
| "learning_rate": 7.435726655121886e-07, |
| "loss": 89.6667, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.33144535347178516, |
| "grad_norm": 106.3125, |
| "learning_rate": 7.42906620487545e-07, |
| "loss": 89.072, |
| "step": 2765 |
| }, |
| { |
| "epoch": 0.332044712157991, |
| "grad_norm": 102.3125, |
| "learning_rate": 7.422405754629013e-07, |
| "loss": 89.1722, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.3326440708441967, |
| "grad_norm": 102.8125, |
| "learning_rate": 7.415745304382576e-07, |
| "loss": 88.0948, |
| "step": 2775 |
| }, |
| { |
| "epoch": 0.3332434295304025, |
| "grad_norm": 105.5625, |
| "learning_rate": 7.40908485413614e-07, |
| "loss": 88.5523, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.33384278821660823, |
| "grad_norm": 99.75, |
| "learning_rate": 7.402424403889703e-07, |
| "loss": 88.9792, |
| "step": 2785 |
| }, |
| { |
| "epoch": 0.334442146902814, |
| "grad_norm": 101.5, |
| "learning_rate": 7.395763953643266e-07, |
| "loss": 89.6106, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.33504150558901974, |
| "grad_norm": 103.25, |
| "learning_rate": 7.38910350339683e-07, |
| "loss": 89.6081, |
| "step": 2795 |
| }, |
| { |
| "epoch": 0.3356408642752255, |
| "grad_norm": 105.625, |
| "learning_rate": 7.382443053150393e-07, |
| "loss": 89.297, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.33624022296143125, |
| "grad_norm": 104.25, |
| "learning_rate": 7.375782602903956e-07, |
| "loss": 87.779, |
| "step": 2805 |
| }, |
| { |
| "epoch": 0.33683958164763705, |
| "grad_norm": 102.0625, |
| "learning_rate": 7.36912215265752e-07, |
| "loss": 86.9398, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.3374389403338428, |
| "grad_norm": 106.5625, |
| "learning_rate": 7.362461702411083e-07, |
| "loss": 87.6691, |
| "step": 2815 |
| }, |
| { |
| "epoch": 0.33803829902004856, |
| "grad_norm": 104.6875, |
| "learning_rate": 7.355801252164647e-07, |
| "loss": 90.144, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.3386376577062543, |
| "grad_norm": 105.8125, |
| "learning_rate": 7.349140801918209e-07, |
| "loss": 88.8413, |
| "step": 2825 |
| }, |
| { |
| "epoch": 0.33923701639246007, |
| "grad_norm": 106.875, |
| "learning_rate": 7.342480351671773e-07, |
| "loss": 88.062, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.3398363750786658, |
| "grad_norm": 105.3125, |
| "learning_rate": 7.335819901425337e-07, |
| "loss": 89.8285, |
| "step": 2835 |
| }, |
| { |
| "epoch": 0.3404357337648716, |
| "grad_norm": 104.0625, |
| "learning_rate": 7.329159451178899e-07, |
| "loss": 88.6042, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.3410350924510773, |
| "grad_norm": 104.75, |
| "learning_rate": 7.322499000932463e-07, |
| "loss": 89.5081, |
| "step": 2845 |
| }, |
| { |
| "epoch": 0.3416344511372831, |
| "grad_norm": 103.0625, |
| "learning_rate": 7.315838550686027e-07, |
| "loss": 88.0691, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.3422338098234889, |
| "grad_norm": 103.6875, |
| "learning_rate": 7.309178100439589e-07, |
| "loss": 87.786, |
| "step": 2855 |
| }, |
| { |
| "epoch": 0.34283316850969464, |
| "grad_norm": 102.6875, |
| "learning_rate": 7.302517650193153e-07, |
| "loss": 88.005, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.3434325271959004, |
| "grad_norm": 101.3125, |
| "learning_rate": 7.295857199946716e-07, |
| "loss": 90.2978, |
| "step": 2865 |
| }, |
| { |
| "epoch": 0.34403188588210615, |
| "grad_norm": 104.8125, |
| "learning_rate": 7.28919674970028e-07, |
| "loss": 88.8875, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.3446312445683119, |
| "grad_norm": 104.375, |
| "learning_rate": 7.282536299453843e-07, |
| "loss": 88.095, |
| "step": 2875 |
| }, |
| { |
| "epoch": 0.34523060325451765, |
| "grad_norm": 105.5625, |
| "learning_rate": 7.275875849207406e-07, |
| "loss": 88.96, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.3458299619407234, |
| "grad_norm": 104.375, |
| "learning_rate": 7.26921539896097e-07, |
| "loss": 88.8805, |
| "step": 2885 |
| }, |
| { |
| "epoch": 0.34642932062692916, |
| "grad_norm": 104.1875, |
| "learning_rate": 7.262554948714533e-07, |
| "loss": 88.451, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.34702867931313497, |
| "grad_norm": 109.0, |
| "learning_rate": 7.255894498468096e-07, |
| "loss": 87.8166, |
| "step": 2895 |
| }, |
| { |
| "epoch": 0.3476280379993407, |
| "grad_norm": 101.25, |
| "learning_rate": 7.24923404822166e-07, |
| "loss": 87.2797, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.3482273966855465, |
| "grad_norm": 105.3125, |
| "learning_rate": 7.242573597975222e-07, |
| "loss": 87.5472, |
| "step": 2905 |
| }, |
| { |
| "epoch": 0.34882675537175223, |
| "grad_norm": 106.125, |
| "learning_rate": 7.235913147728786e-07, |
| "loss": 88.0739, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.349426114057958, |
| "grad_norm": 103.6875, |
| "learning_rate": 7.22925269748235e-07, |
| "loss": 88.6875, |
| "step": 2915 |
| }, |
| { |
| "epoch": 0.35002547274416373, |
| "grad_norm": 102.25, |
| "learning_rate": 7.222592247235913e-07, |
| "loss": 88.5786, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.3506248314303695, |
| "grad_norm": 108.1875, |
| "learning_rate": 7.215931796989476e-07, |
| "loss": 88.9486, |
| "step": 2925 |
| }, |
| { |
| "epoch": 0.35122419011657524, |
| "grad_norm": 104.25, |
| "learning_rate": 7.20927134674304e-07, |
| "loss": 88.3493, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.35182354880278105, |
| "grad_norm": 103.25, |
| "learning_rate": 7.202610896496603e-07, |
| "loss": 88.4265, |
| "step": 2935 |
| }, |
| { |
| "epoch": 0.3524229074889868, |
| "grad_norm": 101.8125, |
| "learning_rate": 7.195950446250166e-07, |
| "loss": 87.42, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.35302226617519256, |
| "grad_norm": 102.0625, |
| "learning_rate": 7.189289996003729e-07, |
| "loss": 86.7885, |
| "step": 2945 |
| }, |
| { |
| "epoch": 0.3536216248613983, |
| "grad_norm": 104.0, |
| "learning_rate": 7.182629545757293e-07, |
| "loss": 88.2069, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.35422098354760406, |
| "grad_norm": 104.875, |
| "learning_rate": 7.175969095510857e-07, |
| "loss": 86.0218, |
| "step": 2955 |
| }, |
| { |
| "epoch": 0.3548203422338098, |
| "grad_norm": 100.8125, |
| "learning_rate": 7.169308645264419e-07, |
| "loss": 87.1083, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.35541970092001557, |
| "grad_norm": 106.1875, |
| "learning_rate": 7.162648195017983e-07, |
| "loss": 88.2362, |
| "step": 2965 |
| }, |
| { |
| "epoch": 0.3560190596062213, |
| "grad_norm": 103.8125, |
| "learning_rate": 7.155987744771547e-07, |
| "loss": 87.8695, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.35661841829242713, |
| "grad_norm": 100.125, |
| "learning_rate": 7.149327294525109e-07, |
| "loss": 87.0618, |
| "step": 2975 |
| }, |
| { |
| "epoch": 0.3572177769786329, |
| "grad_norm": 104.5, |
| "learning_rate": 7.142666844278673e-07, |
| "loss": 88.6391, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.35781713566483864, |
| "grad_norm": 106.125, |
| "learning_rate": 7.136006394032237e-07, |
| "loss": 89.9931, |
| "step": 2985 |
| }, |
| { |
| "epoch": 0.3584164943510444, |
| "grad_norm": 105.9375, |
| "learning_rate": 7.129345943785799e-07, |
| "loss": 87.4074, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.35901585303725014, |
| "grad_norm": 104.5625, |
| "learning_rate": 7.122685493539363e-07, |
| "loss": 88.1837, |
| "step": 2995 |
| }, |
| { |
| "epoch": 0.3596152117234559, |
| "grad_norm": 106.9375, |
| "learning_rate": 7.116025043292926e-07, |
| "loss": 88.1768, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.3596152117234559, |
| "eval_loss": 2.7459769248962402, |
| "eval_runtime": 403.4871, |
| "eval_samples_per_second": 1114.293, |
| "eval_steps_per_second": 34.824, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.36021457040966165, |
| "grad_norm": 103.625, |
| "learning_rate": 7.10936459304649e-07, |
| "loss": 88.046, |
| "step": 3005 |
| }, |
| { |
| "epoch": 0.3608139290958674, |
| "grad_norm": 103.4375, |
| "learning_rate": 7.102704142800053e-07, |
| "loss": 88.8067, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.36141328778207316, |
| "grad_norm": 106.25, |
| "learning_rate": 7.096043692553616e-07, |
| "loss": 88.1824, |
| "step": 3015 |
| }, |
| { |
| "epoch": 0.36201264646827896, |
| "grad_norm": 101.0, |
| "learning_rate": 7.08938324230718e-07, |
| "loss": 87.2302, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.3626120051544847, |
| "grad_norm": 105.75, |
| "learning_rate": 7.082722792060743e-07, |
| "loss": 88.3142, |
| "step": 3025 |
| }, |
| { |
| "epoch": 0.36321136384069047, |
| "grad_norm": 105.5625, |
| "learning_rate": 7.076062341814306e-07, |
| "loss": 87.6098, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.3638107225268962, |
| "grad_norm": 103.125, |
| "learning_rate": 7.06940189156787e-07, |
| "loss": 88.7042, |
| "step": 3035 |
| }, |
| { |
| "epoch": 0.364410081213102, |
| "grad_norm": 104.0625, |
| "learning_rate": 7.062741441321432e-07, |
| "loss": 87.8503, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.36500943989930773, |
| "grad_norm": 102.75, |
| "learning_rate": 7.056080991074996e-07, |
| "loss": 87.639, |
| "step": 3045 |
| }, |
| { |
| "epoch": 0.3656087985855135, |
| "grad_norm": 105.1875, |
| "learning_rate": 7.04942054082856e-07, |
| "loss": 87.0323, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.36620815727171924, |
| "grad_norm": 102.0625, |
| "learning_rate": 7.042760090582123e-07, |
| "loss": 87.2938, |
| "step": 3055 |
| }, |
| { |
| "epoch": 0.36680751595792505, |
| "grad_norm": 102.625, |
| "learning_rate": 7.036099640335686e-07, |
| "loss": 87.2754, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.3674068746441308, |
| "grad_norm": 103.6875, |
| "learning_rate": 7.02943919008925e-07, |
| "loss": 87.5661, |
| "step": 3065 |
| }, |
| { |
| "epoch": 0.36800623333033655, |
| "grad_norm": 105.4375, |
| "learning_rate": 7.022778739842813e-07, |
| "loss": 87.7275, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.3686055920165423, |
| "grad_norm": 103.75, |
| "learning_rate": 7.016118289596376e-07, |
| "loss": 86.8911, |
| "step": 3075 |
| }, |
| { |
| "epoch": 0.36920495070274806, |
| "grad_norm": 103.6875, |
| "learning_rate": 7.009457839349939e-07, |
| "loss": 88.436, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.3698043093889538, |
| "grad_norm": 102.1875, |
| "learning_rate": 7.002797389103503e-07, |
| "loss": 87.3927, |
| "step": 3085 |
| }, |
| { |
| "epoch": 0.37040366807515956, |
| "grad_norm": 102.3125, |
| "learning_rate": 6.996136938857067e-07, |
| "loss": 86.9897, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.3710030267613653, |
| "grad_norm": 104.9375, |
| "learning_rate": 6.989476488610629e-07, |
| "loss": 87.0566, |
| "step": 3095 |
| }, |
| { |
| "epoch": 0.3716023854475711, |
| "grad_norm": 106.5, |
| "learning_rate": 6.982816038364193e-07, |
| "loss": 86.4962, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.3722017441337769, |
| "grad_norm": 104.75, |
| "learning_rate": 6.976155588117757e-07, |
| "loss": 87.2792, |
| "step": 3105 |
| }, |
| { |
| "epoch": 0.37280110281998263, |
| "grad_norm": 104.1875, |
| "learning_rate": 6.969495137871319e-07, |
| "loss": 88.572, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.3734004615061884, |
| "grad_norm": 105.3125, |
| "learning_rate": 6.962834687624883e-07, |
| "loss": 86.962, |
| "step": 3115 |
| }, |
| { |
| "epoch": 0.37399982019239414, |
| "grad_norm": 105.5, |
| "learning_rate": 6.956174237378446e-07, |
| "loss": 88.2113, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.3745991788785999, |
| "grad_norm": 107.125, |
| "learning_rate": 6.949513787132009e-07, |
| "loss": 86.343, |
| "step": 3125 |
| }, |
| { |
| "epoch": 0.37519853756480565, |
| "grad_norm": 106.875, |
| "learning_rate": 6.942853336885573e-07, |
| "loss": 85.8898, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.3757978962510114, |
| "grad_norm": 102.625, |
| "learning_rate": 6.936192886639136e-07, |
| "loss": 86.6348, |
| "step": 3135 |
| }, |
| { |
| "epoch": 0.37639725493721715, |
| "grad_norm": 103.3125, |
| "learning_rate": 6.9295324363927e-07, |
| "loss": 87.8125, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.37699661362342296, |
| "grad_norm": 102.1875, |
| "learning_rate": 6.922871986146263e-07, |
| "loss": 88.9906, |
| "step": 3145 |
| }, |
| { |
| "epoch": 0.3775959723096287, |
| "grad_norm": 101.1875, |
| "learning_rate": 6.916211535899826e-07, |
| "loss": 87.7328, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.37819533099583447, |
| "grad_norm": 105.875, |
| "learning_rate": 6.90955108565339e-07, |
| "loss": 87.4569, |
| "step": 3155 |
| }, |
| { |
| "epoch": 0.3787946896820402, |
| "grad_norm": 104.0625, |
| "learning_rate": 6.902890635406952e-07, |
| "loss": 87.0657, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.379394048368246, |
| "grad_norm": 108.6875, |
| "learning_rate": 6.896230185160516e-07, |
| "loss": 88.1625, |
| "step": 3165 |
| }, |
| { |
| "epoch": 0.3799934070544517, |
| "grad_norm": 103.875, |
| "learning_rate": 6.88956973491408e-07, |
| "loss": 87.7685, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.3805927657406575, |
| "grad_norm": 103.9375, |
| "learning_rate": 6.882909284667642e-07, |
| "loss": 86.1103, |
| "step": 3175 |
| }, |
| { |
| "epoch": 0.38119212442686323, |
| "grad_norm": 103.9375, |
| "learning_rate": 6.876248834421206e-07, |
| "loss": 87.3419, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.38179148311306904, |
| "grad_norm": 100.8125, |
| "learning_rate": 6.86958838417477e-07, |
| "loss": 87.24, |
| "step": 3185 |
| }, |
| { |
| "epoch": 0.3823908417992748, |
| "grad_norm": 105.8125, |
| "learning_rate": 6.862927933928333e-07, |
| "loss": 87.5693, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.38299020048548055, |
| "grad_norm": 103.9375, |
| "learning_rate": 6.856267483681896e-07, |
| "loss": 85.8992, |
| "step": 3195 |
| }, |
| { |
| "epoch": 0.3835895591716863, |
| "grad_norm": 102.0, |
| "learning_rate": 6.849607033435459e-07, |
| "loss": 86.0182, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.38418891785789205, |
| "grad_norm": 101.5, |
| "learning_rate": 6.842946583189024e-07, |
| "loss": 86.133, |
| "step": 3205 |
| }, |
| { |
| "epoch": 0.3847882765440978, |
| "grad_norm": 100.3125, |
| "learning_rate": 6.836286132942586e-07, |
| "loss": 86.924, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.38538763523030356, |
| "grad_norm": 105.5625, |
| "learning_rate": 6.82962568269615e-07, |
| "loss": 87.4621, |
| "step": 3215 |
| }, |
| { |
| "epoch": 0.3859869939165093, |
| "grad_norm": 104.3125, |
| "learning_rate": 6.822965232449714e-07, |
| "loss": 86.111, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.3865863526027151, |
| "grad_norm": 103.75, |
| "learning_rate": 6.816304782203278e-07, |
| "loss": 87.4626, |
| "step": 3225 |
| }, |
| { |
| "epoch": 0.3871857112889209, |
| "grad_norm": 106.0, |
| "learning_rate": 6.80964433195684e-07, |
| "loss": 88.3997, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.38778506997512663, |
| "grad_norm": 106.4375, |
| "learning_rate": 6.802983881710404e-07, |
| "loss": 87.3667, |
| "step": 3235 |
| }, |
| { |
| "epoch": 0.3883844286613324, |
| "grad_norm": 103.9375, |
| "learning_rate": 6.796323431463967e-07, |
| "loss": 86.2968, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.38898378734753813, |
| "grad_norm": 105.875, |
| "learning_rate": 6.78966298121753e-07, |
| "loss": 89.0274, |
| "step": 3245 |
| }, |
| { |
| "epoch": 0.3895831460337439, |
| "grad_norm": 101.75, |
| "learning_rate": 6.783002530971094e-07, |
| "loss": 85.2452, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.39018250471994964, |
| "grad_norm": 104.25, |
| "learning_rate": 6.776342080724657e-07, |
| "loss": 86.6051, |
| "step": 3255 |
| }, |
| { |
| "epoch": 0.3907818634061554, |
| "grad_norm": 104.875, |
| "learning_rate": 6.76968163047822e-07, |
| "loss": 85.579, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.39138122209236115, |
| "grad_norm": 103.4375, |
| "learning_rate": 6.763021180231784e-07, |
| "loss": 86.8788, |
| "step": 3265 |
| }, |
| { |
| "epoch": 0.39198058077856696, |
| "grad_norm": 105.0625, |
| "learning_rate": 6.756360729985347e-07, |
| "loss": 88.0645, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.3925799394647727, |
| "grad_norm": 103.25, |
| "learning_rate": 6.749700279738911e-07, |
| "loss": 87.0825, |
| "step": 3275 |
| }, |
| { |
| "epoch": 0.39317929815097846, |
| "grad_norm": 105.5, |
| "learning_rate": 6.743039829492473e-07, |
| "loss": 86.4466, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.3937786568371842, |
| "grad_norm": 105.0, |
| "learning_rate": 6.736379379246037e-07, |
| "loss": 87.9878, |
| "step": 3285 |
| }, |
| { |
| "epoch": 0.39437801552338997, |
| "grad_norm": 106.3125, |
| "learning_rate": 6.729718928999601e-07, |
| "loss": 87.5672, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.3949773742095957, |
| "grad_norm": 103.0625, |
| "learning_rate": 6.723058478753163e-07, |
| "loss": 87.317, |
| "step": 3295 |
| }, |
| { |
| "epoch": 0.3955767328958015, |
| "grad_norm": 102.1875, |
| "learning_rate": 6.716398028506727e-07, |
| "loss": 86.4539, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.39617609158200723, |
| "grad_norm": 99.5625, |
| "learning_rate": 6.709737578260291e-07, |
| "loss": 85.5183, |
| "step": 3305 |
| }, |
| { |
| "epoch": 0.39677545026821304, |
| "grad_norm": 101.5, |
| "learning_rate": 6.703077128013853e-07, |
| "loss": 86.3448, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.3973748089544188, |
| "grad_norm": 101.4375, |
| "learning_rate": 6.696416677767417e-07, |
| "loss": 87.9496, |
| "step": 3315 |
| }, |
| { |
| "epoch": 0.39797416764062454, |
| "grad_norm": 101.4375, |
| "learning_rate": 6.68975622752098e-07, |
| "loss": 86.568, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.3985735263268303, |
| "grad_norm": 100.75, |
| "learning_rate": 6.683095777274544e-07, |
| "loss": 84.8123, |
| "step": 3325 |
| }, |
| { |
| "epoch": 0.39917288501303605, |
| "grad_norm": 105.4375, |
| "learning_rate": 6.676435327028107e-07, |
| "loss": 87.7344, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.3997722436992418, |
| "grad_norm": 101.375, |
| "learning_rate": 6.66977487678167e-07, |
| "loss": 85.0206, |
| "step": 3335 |
| }, |
| { |
| "epoch": 0.40037160238544756, |
| "grad_norm": 105.3125, |
| "learning_rate": 6.663114426535234e-07, |
| "loss": 87.3425, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.4009709610716533, |
| "grad_norm": 104.25, |
| "learning_rate": 6.656453976288797e-07, |
| "loss": 86.7892, |
| "step": 3345 |
| }, |
| { |
| "epoch": 0.4015703197578591, |
| "grad_norm": 104.9375, |
| "learning_rate": 6.64979352604236e-07, |
| "loss": 85.9361, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.40216967844406487, |
| "grad_norm": 105.875, |
| "learning_rate": 6.643133075795924e-07, |
| "loss": 87.2027, |
| "step": 3355 |
| }, |
| { |
| "epoch": 0.4027690371302706, |
| "grad_norm": 102.6875, |
| "learning_rate": 6.636472625549488e-07, |
| "loss": 85.9342, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.4033683958164764, |
| "grad_norm": 104.875, |
| "learning_rate": 6.62981217530305e-07, |
| "loss": 85.8879, |
| "step": 3365 |
| }, |
| { |
| "epoch": 0.40396775450268213, |
| "grad_norm": 105.8125, |
| "learning_rate": 6.623151725056614e-07, |
| "loss": 86.2892, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.4045671131888879, |
| "grad_norm": 103.125, |
| "learning_rate": 6.616491274810177e-07, |
| "loss": 86.4097, |
| "step": 3375 |
| }, |
| { |
| "epoch": 0.40516647187509364, |
| "grad_norm": 106.625, |
| "learning_rate": 6.60983082456374e-07, |
| "loss": 86.6329, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.4057658305612994, |
| "grad_norm": 105.5625, |
| "learning_rate": 6.603170374317304e-07, |
| "loss": 85.9894, |
| "step": 3385 |
| }, |
| { |
| "epoch": 0.40636518924750514, |
| "grad_norm": 103.5625, |
| "learning_rate": 6.596509924070867e-07, |
| "loss": 86.8589, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.40696454793371095, |
| "grad_norm": 104.6875, |
| "learning_rate": 6.58984947382443e-07, |
| "loss": 85.3977, |
| "step": 3395 |
| }, |
| { |
| "epoch": 0.4075639066199167, |
| "grad_norm": 104.0625, |
| "learning_rate": 6.583189023577994e-07, |
| "loss": 85.5662, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.40816326530612246, |
| "grad_norm": 107.0625, |
| "learning_rate": 6.576528573331557e-07, |
| "loss": 87.1105, |
| "step": 3405 |
| }, |
| { |
| "epoch": 0.4087626239923282, |
| "grad_norm": 104.1875, |
| "learning_rate": 6.569868123085121e-07, |
| "loss": 86.211, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.40936198267853396, |
| "grad_norm": 102.5625, |
| "learning_rate": 6.563207672838683e-07, |
| "loss": 86.7257, |
| "step": 3415 |
| }, |
| { |
| "epoch": 0.4099613413647397, |
| "grad_norm": 106.1875, |
| "learning_rate": 6.556547222592247e-07, |
| "loss": 85.0465, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.41056070005094547, |
| "grad_norm": 104.1875, |
| "learning_rate": 6.549886772345811e-07, |
| "loss": 86.0544, |
| "step": 3425 |
| }, |
| { |
| "epoch": 0.4111600587371512, |
| "grad_norm": 100.5, |
| "learning_rate": 6.543226322099373e-07, |
| "loss": 84.8226, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.41175941742335703, |
| "grad_norm": 105.1875, |
| "learning_rate": 6.536565871852937e-07, |
| "loss": 86.4019, |
| "step": 3435 |
| }, |
| { |
| "epoch": 0.4123587761095628, |
| "grad_norm": 101.875, |
| "learning_rate": 6.529905421606501e-07, |
| "loss": 85.6632, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.41295813479576854, |
| "grad_norm": 103.75, |
| "learning_rate": 6.523244971360063e-07, |
| "loss": 87.07, |
| "step": 3445 |
| }, |
| { |
| "epoch": 0.4135574934819743, |
| "grad_norm": 111.1875, |
| "learning_rate": 6.516584521113627e-07, |
| "loss": 86.681, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.41415685216818005, |
| "grad_norm": 102.5, |
| "learning_rate": 6.50992407086719e-07, |
| "loss": 86.0613, |
| "step": 3455 |
| }, |
| { |
| "epoch": 0.4147562108543858, |
| "grad_norm": 103.4375, |
| "learning_rate": 6.503263620620754e-07, |
| "loss": 87.7511, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.41535556954059155, |
| "grad_norm": 103.5625, |
| "learning_rate": 6.496603170374317e-07, |
| "loss": 85.9475, |
| "step": 3465 |
| }, |
| { |
| "epoch": 0.4159549282267973, |
| "grad_norm": 104.75, |
| "learning_rate": 6.48994272012788e-07, |
| "loss": 84.7282, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.4165542869130031, |
| "grad_norm": 101.5625, |
| "learning_rate": 6.483282269881444e-07, |
| "loss": 85.3223, |
| "step": 3475 |
| }, |
| { |
| "epoch": 0.41715364559920887, |
| "grad_norm": 105.3125, |
| "learning_rate": 6.476621819635007e-07, |
| "loss": 85.0698, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.4177530042854146, |
| "grad_norm": 102.8125, |
| "learning_rate": 6.46996136938857e-07, |
| "loss": 86.7545, |
| "step": 3485 |
| }, |
| { |
| "epoch": 0.4183523629716204, |
| "grad_norm": 107.75, |
| "learning_rate": 6.463300919142134e-07, |
| "loss": 86.1996, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.4189517216578261, |
| "grad_norm": 101.125, |
| "learning_rate": 6.456640468895697e-07, |
| "loss": 86.6891, |
| "step": 3495 |
| }, |
| { |
| "epoch": 0.4195510803440319, |
| "grad_norm": 103.375, |
| "learning_rate": 6.44998001864926e-07, |
| "loss": 86.8633, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.4195510803440319, |
| "eval_loss": 2.689061403274536, |
| "eval_runtime": 405.5103, |
| "eval_samples_per_second": 1108.734, |
| "eval_steps_per_second": 34.65, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.42015043903023763, |
| "grad_norm": 104.875, |
| "learning_rate": 6.443319568402824e-07, |
| "loss": 86.7122, |
| "step": 3505 |
| }, |
| { |
| "epoch": 0.4207497977164434, |
| "grad_norm": 102.1875, |
| "learning_rate": 6.436659118156387e-07, |
| "loss": 85.7782, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.42134915640264914, |
| "grad_norm": 102.0, |
| "learning_rate": 6.42999866790995e-07, |
| "loss": 86.5698, |
| "step": 3515 |
| }, |
| { |
| "epoch": 0.42194851508885495, |
| "grad_norm": 106.0, |
| "learning_rate": 6.423338217663514e-07, |
| "loss": 85.1612, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.4225478737750607, |
| "grad_norm": 102.4375, |
| "learning_rate": 6.416677767417077e-07, |
| "loss": 86.2275, |
| "step": 3525 |
| }, |
| { |
| "epoch": 0.42314723246126645, |
| "grad_norm": 107.1875, |
| "learning_rate": 6.41001731717064e-07, |
| "loss": 84.2113, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.4237465911474722, |
| "grad_norm": 103.5, |
| "learning_rate": 6.403356866924203e-07, |
| "loss": 86.9985, |
| "step": 3535 |
| }, |
| { |
| "epoch": 0.42434594983367796, |
| "grad_norm": 105.875, |
| "learning_rate": 6.396696416677767e-07, |
| "loss": 84.5887, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.4249453085198837, |
| "grad_norm": 103.375, |
| "learning_rate": 6.390035966431331e-07, |
| "loss": 86.1314, |
| "step": 3545 |
| }, |
| { |
| "epoch": 0.42554466720608947, |
| "grad_norm": 107.25, |
| "learning_rate": 6.383375516184893e-07, |
| "loss": 86.0269, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.4261440258922952, |
| "grad_norm": 106.0, |
| "learning_rate": 6.376715065938457e-07, |
| "loss": 86.8127, |
| "step": 3555 |
| }, |
| { |
| "epoch": 0.42674338457850103, |
| "grad_norm": 105.6875, |
| "learning_rate": 6.370054615692021e-07, |
| "loss": 86.3978, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.4273427432647068, |
| "grad_norm": 103.9375, |
| "learning_rate": 6.363394165445583e-07, |
| "loss": 83.8842, |
| "step": 3565 |
| }, |
| { |
| "epoch": 0.42794210195091253, |
| "grad_norm": 104.6875, |
| "learning_rate": 6.356733715199147e-07, |
| "loss": 85.6649, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.4285414606371183, |
| "grad_norm": 106.3125, |
| "learning_rate": 6.35007326495271e-07, |
| "loss": 86.1648, |
| "step": 3575 |
| }, |
| { |
| "epoch": 0.42914081932332404, |
| "grad_norm": 104.5, |
| "learning_rate": 6.343412814706273e-07, |
| "loss": 84.8243, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.4297401780095298, |
| "grad_norm": 103.125, |
| "learning_rate": 6.336752364459837e-07, |
| "loss": 86.0243, |
| "step": 3585 |
| }, |
| { |
| "epoch": 0.43033953669573555, |
| "grad_norm": 102.9375, |
| "learning_rate": 6.3300919142134e-07, |
| "loss": 85.5283, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.4309388953819413, |
| "grad_norm": 100.0625, |
| "learning_rate": 6.323431463966964e-07, |
| "loss": 84.9534, |
| "step": 3595 |
| }, |
| { |
| "epoch": 0.4315382540681471, |
| "grad_norm": 102.0, |
| "learning_rate": 6.316771013720527e-07, |
| "loss": 86.3156, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.43213761275435286, |
| "grad_norm": 102.0625, |
| "learning_rate": 6.31011056347409e-07, |
| "loss": 85.5763, |
| "step": 3605 |
| }, |
| { |
| "epoch": 0.4327369714405586, |
| "grad_norm": 101.0625, |
| "learning_rate": 6.303450113227654e-07, |
| "loss": 86.5275, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.43333633012676437, |
| "grad_norm": 108.0, |
| "learning_rate": 6.296789662981216e-07, |
| "loss": 85.2912, |
| "step": 3615 |
| }, |
| { |
| "epoch": 0.4339356888129701, |
| "grad_norm": 102.875, |
| "learning_rate": 6.29012921273478e-07, |
| "loss": 85.6715, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.4345350474991759, |
| "grad_norm": 102.8125, |
| "learning_rate": 6.283468762488344e-07, |
| "loss": 85.735, |
| "step": 3625 |
| }, |
| { |
| "epoch": 0.43513440618538163, |
| "grad_norm": 103.25, |
| "learning_rate": 6.276808312241907e-07, |
| "loss": 84.4985, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.4357337648715874, |
| "grad_norm": 101.5625, |
| "learning_rate": 6.27014786199547e-07, |
| "loss": 85.695, |
| "step": 3635 |
| }, |
| { |
| "epoch": 0.43633312355779313, |
| "grad_norm": 104.875, |
| "learning_rate": 6.263487411749034e-07, |
| "loss": 85.4679, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.43693248224399894, |
| "grad_norm": 101.625, |
| "learning_rate": 6.256826961502597e-07, |
| "loss": 86.5071, |
| "step": 3645 |
| }, |
| { |
| "epoch": 0.4375318409302047, |
| "grad_norm": 108.875, |
| "learning_rate": 6.25016651125616e-07, |
| "loss": 85.3634, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.43813119961641045, |
| "grad_norm": 104.0625, |
| "learning_rate": 6.243506061009723e-07, |
| "loss": 84.9664, |
| "step": 3655 |
| }, |
| { |
| "epoch": 0.4387305583026162, |
| "grad_norm": 103.3125, |
| "learning_rate": 6.236845610763287e-07, |
| "loss": 85.6819, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.43932991698882196, |
| "grad_norm": 108.125, |
| "learning_rate": 6.23018516051685e-07, |
| "loss": 86.4368, |
| "step": 3665 |
| }, |
| { |
| "epoch": 0.4399292756750277, |
| "grad_norm": 107.375, |
| "learning_rate": 6.223524710270413e-07, |
| "loss": 84.9657, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.44052863436123346, |
| "grad_norm": 103.1875, |
| "learning_rate": 6.216864260023977e-07, |
| "loss": 84.2119, |
| "step": 3675 |
| }, |
| { |
| "epoch": 0.4411279930474392, |
| "grad_norm": 103.6875, |
| "learning_rate": 6.210203809777542e-07, |
| "loss": 85.9094, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.441727351733645, |
| "grad_norm": 102.5, |
| "learning_rate": 6.203543359531103e-07, |
| "loss": 84.6232, |
| "step": 3685 |
| }, |
| { |
| "epoch": 0.4423267104198508, |
| "grad_norm": 103.5625, |
| "learning_rate": 6.196882909284668e-07, |
| "loss": 86.2957, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.44292606910605653, |
| "grad_norm": 105.0625, |
| "learning_rate": 6.190222459038232e-07, |
| "loss": 84.6409, |
| "step": 3695 |
| }, |
| { |
| "epoch": 0.4435254277922623, |
| "grad_norm": 109.5625, |
| "learning_rate": 6.183562008791793e-07, |
| "loss": 86.1193, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.44412478647846804, |
| "grad_norm": 103.125, |
| "learning_rate": 6.176901558545358e-07, |
| "loss": 85.6869, |
| "step": 3705 |
| }, |
| { |
| "epoch": 0.4447241451646738, |
| "grad_norm": 106.5625, |
| "learning_rate": 6.170241108298921e-07, |
| "loss": 86.3559, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.44532350385087954, |
| "grad_norm": 103.0, |
| "learning_rate": 6.163580658052485e-07, |
| "loss": 84.8422, |
| "step": 3715 |
| }, |
| { |
| "epoch": 0.4459228625370853, |
| "grad_norm": 107.5625, |
| "learning_rate": 6.156920207806048e-07, |
| "loss": 85.5576, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.4465222212232911, |
| "grad_norm": 104.125, |
| "learning_rate": 6.150259757559611e-07, |
| "loss": 84.6006, |
| "step": 3725 |
| }, |
| { |
| "epoch": 0.44712157990949686, |
| "grad_norm": 105.875, |
| "learning_rate": 6.143599307313175e-07, |
| "loss": 86.1678, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.4477209385957026, |
| "grad_norm": 104.25, |
| "learning_rate": 6.136938857066738e-07, |
| "loss": 84.4279, |
| "step": 3735 |
| }, |
| { |
| "epoch": 0.44832029728190836, |
| "grad_norm": 105.3125, |
| "learning_rate": 6.130278406820301e-07, |
| "loss": 84.0661, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.4489196559681141, |
| "grad_norm": 103.625, |
| "learning_rate": 6.123617956573865e-07, |
| "loss": 84.5492, |
| "step": 3745 |
| }, |
| { |
| "epoch": 0.44951901465431987, |
| "grad_norm": 102.375, |
| "learning_rate": 6.116957506327427e-07, |
| "loss": 85.9095, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.4501183733405256, |
| "grad_norm": 108.3125, |
| "learning_rate": 6.110297056080991e-07, |
| "loss": 85.9708, |
| "step": 3755 |
| }, |
| { |
| "epoch": 0.4507177320267314, |
| "grad_norm": 103.875, |
| "learning_rate": 6.103636605834555e-07, |
| "loss": 86.4198, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.45131709071293713, |
| "grad_norm": 103.1875, |
| "learning_rate": 6.096976155588118e-07, |
| "loss": 86.0834, |
| "step": 3765 |
| }, |
| { |
| "epoch": 0.45191644939914294, |
| "grad_norm": 106.875, |
| "learning_rate": 6.090315705341681e-07, |
| "loss": 84.9955, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.4525158080853487, |
| "grad_norm": 105.625, |
| "learning_rate": 6.083655255095245e-07, |
| "loss": 85.3397, |
| "step": 3775 |
| }, |
| { |
| "epoch": 0.45311516677155445, |
| "grad_norm": 104.5, |
| "learning_rate": 6.076994804848808e-07, |
| "loss": 86.0351, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.4537145254577602, |
| "grad_norm": 100.875, |
| "learning_rate": 6.070334354602371e-07, |
| "loss": 84.0522, |
| "step": 3785 |
| }, |
| { |
| "epoch": 0.45431388414396595, |
| "grad_norm": 104.75, |
| "learning_rate": 6.063673904355934e-07, |
| "loss": 83.8673, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.4549132428301717, |
| "grad_norm": 105.625, |
| "learning_rate": 6.057013454109498e-07, |
| "loss": 84.8843, |
| "step": 3795 |
| }, |
| { |
| "epoch": 0.45551260151637746, |
| "grad_norm": 102.375, |
| "learning_rate": 6.050353003863061e-07, |
| "loss": 85.575, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.4561119602025832, |
| "grad_norm": 105.375, |
| "learning_rate": 6.043692553616624e-07, |
| "loss": 85.2, |
| "step": 3805 |
| }, |
| { |
| "epoch": 0.456711318888789, |
| "grad_norm": 103.125, |
| "learning_rate": 6.037032103370188e-07, |
| "loss": 86.2768, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.4573106775749948, |
| "grad_norm": 104.5, |
| "learning_rate": 6.030371653123752e-07, |
| "loss": 85.3179, |
| "step": 3815 |
| }, |
| { |
| "epoch": 0.4579100362612005, |
| "grad_norm": 105.6875, |
| "learning_rate": 6.023711202877314e-07, |
| "loss": 85.2562, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.4585093949474063, |
| "grad_norm": 104.625, |
| "learning_rate": 6.017050752630878e-07, |
| "loss": 84.96, |
| "step": 3825 |
| }, |
| { |
| "epoch": 0.45910875363361203, |
| "grad_norm": 107.75, |
| "learning_rate": 6.010390302384441e-07, |
| "loss": 84.189, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.4597081123198178, |
| "grad_norm": 101.875, |
| "learning_rate": 6.003729852138004e-07, |
| "loss": 83.2708, |
| "step": 3835 |
| }, |
| { |
| "epoch": 0.46030747100602354, |
| "grad_norm": 103.5, |
| "learning_rate": 5.997069401891568e-07, |
| "loss": 87.3057, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.4609068296922293, |
| "grad_norm": 103.9375, |
| "learning_rate": 5.990408951645131e-07, |
| "loss": 84.9138, |
| "step": 3845 |
| }, |
| { |
| "epoch": 0.4615061883784351, |
| "grad_norm": 103.9375, |
| "learning_rate": 5.983748501398695e-07, |
| "loss": 85.4947, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.46210554706464085, |
| "grad_norm": 101.9375, |
| "learning_rate": 5.977088051152258e-07, |
| "loss": 84.7751, |
| "step": 3855 |
| }, |
| { |
| "epoch": 0.4627049057508466, |
| "grad_norm": 104.0, |
| "learning_rate": 5.970427600905821e-07, |
| "loss": 84.721, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.46330426443705236, |
| "grad_norm": 103.3125, |
| "learning_rate": 5.963767150659385e-07, |
| "loss": 84.749, |
| "step": 3865 |
| }, |
| { |
| "epoch": 0.4639036231232581, |
| "grad_norm": 109.5, |
| "learning_rate": 5.957106700412947e-07, |
| "loss": 83.0867, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.46450298180946387, |
| "grad_norm": 105.9375, |
| "learning_rate": 5.950446250166511e-07, |
| "loss": 84.9955, |
| "step": 3875 |
| }, |
| { |
| "epoch": 0.4651023404956696, |
| "grad_norm": 107.1875, |
| "learning_rate": 5.943785799920075e-07, |
| "loss": 85.8307, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.4657016991818754, |
| "grad_norm": 101.9375, |
| "learning_rate": 5.937125349673637e-07, |
| "loss": 84.0253, |
| "step": 3885 |
| }, |
| { |
| "epoch": 0.4663010578680811, |
| "grad_norm": 103.1875, |
| "learning_rate": 5.930464899427201e-07, |
| "loss": 85.0021, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.46690041655428693, |
| "grad_norm": 102.0625, |
| "learning_rate": 5.923804449180765e-07, |
| "loss": 84.1141, |
| "step": 3895 |
| }, |
| { |
| "epoch": 0.4674997752404927, |
| "grad_norm": 103.375, |
| "learning_rate": 5.917143998934328e-07, |
| "loss": 82.9194, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.46809913392669844, |
| "grad_norm": 105.25, |
| "learning_rate": 5.910483548687891e-07, |
| "loss": 84.037, |
| "step": 3905 |
| }, |
| { |
| "epoch": 0.4686984926129042, |
| "grad_norm": 106.25, |
| "learning_rate": 5.903823098441454e-07, |
| "loss": 84.8373, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.46929785129910995, |
| "grad_norm": 105.75, |
| "learning_rate": 5.897162648195018e-07, |
| "loss": 85.4891, |
| "step": 3915 |
| }, |
| { |
| "epoch": 0.4698972099853157, |
| "grad_norm": 100.9375, |
| "learning_rate": 5.890502197948581e-07, |
| "loss": 84.3028, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.47049656867152145, |
| "grad_norm": 106.0625, |
| "learning_rate": 5.883841747702144e-07, |
| "loss": 84.8452, |
| "step": 3925 |
| }, |
| { |
| "epoch": 0.4710959273577272, |
| "grad_norm": 105.5625, |
| "learning_rate": 5.877181297455708e-07, |
| "loss": 84.6139, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.471695286043933, |
| "grad_norm": 107.0, |
| "learning_rate": 5.870520847209272e-07, |
| "loss": 84.4856, |
| "step": 3935 |
| }, |
| { |
| "epoch": 0.47229464473013877, |
| "grad_norm": 103.5, |
| "learning_rate": 5.863860396962834e-07, |
| "loss": 85.6233, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.4728940034163445, |
| "grad_norm": 102.375, |
| "learning_rate": 5.857199946716398e-07, |
| "loss": 84.1834, |
| "step": 3945 |
| }, |
| { |
| "epoch": 0.4734933621025503, |
| "grad_norm": 108.25, |
| "learning_rate": 5.850539496469961e-07, |
| "loss": 83.889, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.47409272078875603, |
| "grad_norm": 102.9375, |
| "learning_rate": 5.843879046223524e-07, |
| "loss": 83.3476, |
| "step": 3955 |
| }, |
| { |
| "epoch": 0.4746920794749618, |
| "grad_norm": 103.125, |
| "learning_rate": 5.837218595977088e-07, |
| "loss": 84.898, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.47529143816116753, |
| "grad_norm": 102.625, |
| "learning_rate": 5.830558145730651e-07, |
| "loss": 84.5597, |
| "step": 3965 |
| }, |
| { |
| "epoch": 0.4758907968473733, |
| "grad_norm": 104.0625, |
| "learning_rate": 5.823897695484214e-07, |
| "loss": 84.7598, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.4764901555335791, |
| "grad_norm": 101.6875, |
| "learning_rate": 5.817237245237778e-07, |
| "loss": 84.7258, |
| "step": 3975 |
| }, |
| { |
| "epoch": 0.47708951421978485, |
| "grad_norm": 105.375, |
| "learning_rate": 5.810576794991341e-07, |
| "loss": 84.1123, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.4776888729059906, |
| "grad_norm": 107.9375, |
| "learning_rate": 5.803916344744905e-07, |
| "loss": 83.9246, |
| "step": 3985 |
| }, |
| { |
| "epoch": 0.47828823159219636, |
| "grad_norm": 102.4375, |
| "learning_rate": 5.797255894498467e-07, |
| "loss": 83.9059, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.4788875902784021, |
| "grad_norm": 104.1875, |
| "learning_rate": 5.790595444252031e-07, |
| "loss": 83.2477, |
| "step": 3995 |
| }, |
| { |
| "epoch": 0.47948694896460786, |
| "grad_norm": 105.9375, |
| "learning_rate": 5.783934994005595e-07, |
| "loss": 84.4198, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.47948694896460786, |
| "eval_loss": 2.6376805305480957, |
| "eval_runtime": 403.2703, |
| "eval_samples_per_second": 1114.892, |
| "eval_steps_per_second": 34.843, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.4800863076508136, |
| "grad_norm": 103.5625, |
| "learning_rate": 5.777274543759157e-07, |
| "loss": 83.7404, |
| "step": 4005 |
| }, |
| { |
| "epoch": 0.48068566633701937, |
| "grad_norm": 99.5625, |
| "learning_rate": 5.770614093512721e-07, |
| "loss": 84.5335, |
| "step": 4010 |
| }, |
| { |
| "epoch": 0.4812850250232251, |
| "grad_norm": 102.1875, |
| "learning_rate": 5.763953643266285e-07, |
| "loss": 84.0931, |
| "step": 4015 |
| }, |
| { |
| "epoch": 0.48188438370943093, |
| "grad_norm": 106.8125, |
| "learning_rate": 5.757293193019847e-07, |
| "loss": 83.4464, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.4824837423956367, |
| "grad_norm": 105.25, |
| "learning_rate": 5.750632742773411e-07, |
| "loss": 84.1755, |
| "step": 4025 |
| }, |
| { |
| "epoch": 0.48308310108184244, |
| "grad_norm": 105.25, |
| "learning_rate": 5.743972292526974e-07, |
| "loss": 82.9136, |
| "step": 4030 |
| }, |
| { |
| "epoch": 0.4836824597680482, |
| "grad_norm": 103.3125, |
| "learning_rate": 5.737311842280538e-07, |
| "loss": 84.5602, |
| "step": 4035 |
| }, |
| { |
| "epoch": 0.48428181845425394, |
| "grad_norm": 103.75, |
| "learning_rate": 5.730651392034101e-07, |
| "loss": 84.7828, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.4848811771404597, |
| "grad_norm": 106.5, |
| "learning_rate": 5.723990941787664e-07, |
| "loss": 83.2263, |
| "step": 4045 |
| }, |
| { |
| "epoch": 0.48548053582666545, |
| "grad_norm": 107.5, |
| "learning_rate": 5.717330491541228e-07, |
| "loss": 82.3737, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.4860798945128712, |
| "grad_norm": 104.8125, |
| "learning_rate": 5.710670041294791e-07, |
| "loss": 83.2381, |
| "step": 4055 |
| }, |
| { |
| "epoch": 0.486679253199077, |
| "grad_norm": 107.6875, |
| "learning_rate": 5.704009591048354e-07, |
| "loss": 85.2892, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.48727861188528276, |
| "grad_norm": 106.0, |
| "learning_rate": 5.697349140801918e-07, |
| "loss": 85.1233, |
| "step": 4065 |
| }, |
| { |
| "epoch": 0.4878779705714885, |
| "grad_norm": 105.1875, |
| "learning_rate": 5.690688690555482e-07, |
| "loss": 83.7537, |
| "step": 4070 |
| }, |
| { |
| "epoch": 0.48847732925769427, |
| "grad_norm": 103.8125, |
| "learning_rate": 5.684028240309044e-07, |
| "loss": 84.1038, |
| "step": 4075 |
| }, |
| { |
| "epoch": 0.4890766879439, |
| "grad_norm": 107.5625, |
| "learning_rate": 5.677367790062608e-07, |
| "loss": 84.5125, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.4896760466301058, |
| "grad_norm": 102.0625, |
| "learning_rate": 5.670707339816171e-07, |
| "loss": 82.8591, |
| "step": 4085 |
| }, |
| { |
| "epoch": 0.49027540531631153, |
| "grad_norm": 104.25, |
| "learning_rate": 5.664046889569734e-07, |
| "loss": 84.0393, |
| "step": 4090 |
| }, |
| { |
| "epoch": 0.4908747640025173, |
| "grad_norm": 105.6875, |
| "learning_rate": 5.657386439323298e-07, |
| "loss": 82.1754, |
| "step": 4095 |
| }, |
| { |
| "epoch": 0.4914741226887231, |
| "grad_norm": 107.25, |
| "learning_rate": 5.650725989076861e-07, |
| "loss": 85.7898, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.49207348137492884, |
| "grad_norm": 103.375, |
| "learning_rate": 5.644065538830424e-07, |
| "loss": 83.8735, |
| "step": 4105 |
| }, |
| { |
| "epoch": 0.4926728400611346, |
| "grad_norm": 102.375, |
| "learning_rate": 5.637405088583988e-07, |
| "loss": 83.6745, |
| "step": 4110 |
| }, |
| { |
| "epoch": 0.49327219874734035, |
| "grad_norm": 104.0625, |
| "learning_rate": 5.630744638337551e-07, |
| "loss": 83.4683, |
| "step": 4115 |
| }, |
| { |
| "epoch": 0.4938715574335461, |
| "grad_norm": 101.3125, |
| "learning_rate": 5.624084188091115e-07, |
| "loss": 87.2967, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.49447091611975186, |
| "grad_norm": 103.875, |
| "learning_rate": 5.617423737844677e-07, |
| "loss": 84.2752, |
| "step": 4125 |
| }, |
| { |
| "epoch": 0.4950702748059576, |
| "grad_norm": 106.375, |
| "learning_rate": 5.610763287598241e-07, |
| "loss": 84.2686, |
| "step": 4130 |
| }, |
| { |
| "epoch": 0.49566963349216336, |
| "grad_norm": 103.375, |
| "learning_rate": 5.604102837351805e-07, |
| "loss": 82.353, |
| "step": 4135 |
| }, |
| { |
| "epoch": 0.4962689921783692, |
| "grad_norm": 101.625, |
| "learning_rate": 5.597442387105367e-07, |
| "loss": 82.047, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.4968683508645749, |
| "grad_norm": 106.75, |
| "learning_rate": 5.590781936858931e-07, |
| "loss": 83.8074, |
| "step": 4145 |
| }, |
| { |
| "epoch": 0.4974677095507807, |
| "grad_norm": 104.3125, |
| "learning_rate": 5.584121486612496e-07, |
| "loss": 83.597, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.49806706823698643, |
| "grad_norm": 111.5625, |
| "learning_rate": 5.577461036366057e-07, |
| "loss": 84.8574, |
| "step": 4155 |
| }, |
| { |
| "epoch": 0.4986664269231922, |
| "grad_norm": 105.375, |
| "learning_rate": 5.570800586119621e-07, |
| "loss": 83.1505, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.49926578560939794, |
| "grad_norm": 105.6875, |
| "learning_rate": 5.564140135873184e-07, |
| "loss": 84.2874, |
| "step": 4165 |
| }, |
| { |
| "epoch": 0.4998651442956037, |
| "grad_norm": 104.625, |
| "learning_rate": 5.557479685626749e-07, |
| "loss": 82.8634, |
| "step": 4170 |
| }, |
| { |
| "epoch": 0.5004645029818094, |
| "grad_norm": 104.5, |
| "learning_rate": 5.550819235380312e-07, |
| "loss": 82.2823, |
| "step": 4175 |
| }, |
| { |
| "epoch": 0.5010638616680152, |
| "grad_norm": 104.9375, |
| "learning_rate": 5.544158785133875e-07, |
| "loss": 83.2534, |
| "step": 4180 |
| }, |
| { |
| "epoch": 0.501663220354221, |
| "grad_norm": 103.4375, |
| "learning_rate": 5.537498334887439e-07, |
| "loss": 82.9911, |
| "step": 4185 |
| }, |
| { |
| "epoch": 0.5022625790404267, |
| "grad_norm": 106.4375, |
| "learning_rate": 5.530837884641002e-07, |
| "loss": 82.9007, |
| "step": 4190 |
| }, |
| { |
| "epoch": 0.5028619377266325, |
| "grad_norm": 104.25, |
| "learning_rate": 5.524177434394565e-07, |
| "loss": 83.3259, |
| "step": 4195 |
| }, |
| { |
| "epoch": 0.5034612964128382, |
| "grad_norm": 105.1875, |
| "learning_rate": 5.517516984148129e-07, |
| "loss": 83.7739, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.5040606550990441, |
| "grad_norm": 103.5625, |
| "learning_rate": 5.51085653390169e-07, |
| "loss": 83.3372, |
| "step": 4205 |
| }, |
| { |
| "epoch": 0.5046600137852498, |
| "grad_norm": 106.875, |
| "learning_rate": 5.504196083655255e-07, |
| "loss": 83.7161, |
| "step": 4210 |
| }, |
| { |
| "epoch": 0.5052593724714556, |
| "grad_norm": 105.25, |
| "learning_rate": 5.497535633408819e-07, |
| "loss": 82.8205, |
| "step": 4215 |
| }, |
| { |
| "epoch": 0.5058587311576613, |
| "grad_norm": 102.4375, |
| "learning_rate": 5.490875183162382e-07, |
| "loss": 83.3096, |
| "step": 4220 |
| }, |
| { |
| "epoch": 0.5064580898438671, |
| "grad_norm": 102.4375, |
| "learning_rate": 5.484214732915945e-07, |
| "loss": 84.2793, |
| "step": 4225 |
| }, |
| { |
| "epoch": 0.5070574485300728, |
| "grad_norm": 106.875, |
| "learning_rate": 5.477554282669509e-07, |
| "loss": 83.4831, |
| "step": 4230 |
| }, |
| { |
| "epoch": 0.5076568072162786, |
| "grad_norm": 105.25, |
| "learning_rate": 5.470893832423072e-07, |
| "loss": 85.2621, |
| "step": 4235 |
| }, |
| { |
| "epoch": 0.5082561659024843, |
| "grad_norm": 106.5625, |
| "learning_rate": 5.464233382176635e-07, |
| "loss": 82.8462, |
| "step": 4240 |
| }, |
| { |
| "epoch": 0.5088555245886901, |
| "grad_norm": 102.6875, |
| "learning_rate": 5.457572931930198e-07, |
| "loss": 83.4198, |
| "step": 4245 |
| }, |
| { |
| "epoch": 0.5094548832748959, |
| "grad_norm": 104.5, |
| "learning_rate": 5.450912481683762e-07, |
| "loss": 83.3093, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.5100542419611016, |
| "grad_norm": 107.8125, |
| "learning_rate": 5.444252031437326e-07, |
| "loss": 83.5302, |
| "step": 4255 |
| }, |
| { |
| "epoch": 0.5106536006473074, |
| "grad_norm": 109.875, |
| "learning_rate": 5.437591581190888e-07, |
| "loss": 83.8509, |
| "step": 4260 |
| }, |
| { |
| "epoch": 0.5112529593335131, |
| "grad_norm": 105.5, |
| "learning_rate": 5.430931130944452e-07, |
| "loss": 83.2915, |
| "step": 4265 |
| }, |
| { |
| "epoch": 0.5118523180197189, |
| "grad_norm": 105.0625, |
| "learning_rate": 5.424270680698016e-07, |
| "loss": 84.0008, |
| "step": 4270 |
| }, |
| { |
| "epoch": 0.5124516767059246, |
| "grad_norm": 102.4375, |
| "learning_rate": 5.417610230451578e-07, |
| "loss": 82.4604, |
| "step": 4275 |
| }, |
| { |
| "epoch": 0.5130510353921304, |
| "grad_norm": 103.9375, |
| "learning_rate": 5.410949780205142e-07, |
| "loss": 82.8647, |
| "step": 4280 |
| }, |
| { |
| "epoch": 0.5136503940783361, |
| "grad_norm": 104.5625, |
| "learning_rate": 5.404289329958705e-07, |
| "loss": 83.5644, |
| "step": 4285 |
| }, |
| { |
| "epoch": 0.514249752764542, |
| "grad_norm": 106.25, |
| "learning_rate": 5.397628879712268e-07, |
| "loss": 83.7008, |
| "step": 4290 |
| }, |
| { |
| "epoch": 0.5148491114507477, |
| "grad_norm": 102.875, |
| "learning_rate": 5.390968429465832e-07, |
| "loss": 83.0174, |
| "step": 4295 |
| }, |
| { |
| "epoch": 0.5154484701369535, |
| "grad_norm": 105.875, |
| "learning_rate": 5.384307979219395e-07, |
| "loss": 84.0506, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.5160478288231592, |
| "grad_norm": 104.375, |
| "learning_rate": 5.377647528972959e-07, |
| "loss": 82.8537, |
| "step": 4305 |
| }, |
| { |
| "epoch": 0.516647187509365, |
| "grad_norm": 108.5625, |
| "learning_rate": 5.370987078726522e-07, |
| "loss": 84.2384, |
| "step": 4310 |
| }, |
| { |
| "epoch": 0.5172465461955708, |
| "grad_norm": 104.4375, |
| "learning_rate": 5.364326628480085e-07, |
| "loss": 82.8151, |
| "step": 4315 |
| }, |
| { |
| "epoch": 0.5178459048817765, |
| "grad_norm": 102.25, |
| "learning_rate": 5.357666178233649e-07, |
| "loss": 83.6291, |
| "step": 4320 |
| }, |
| { |
| "epoch": 0.5184452635679823, |
| "grad_norm": 101.875, |
| "learning_rate": 5.351005727987211e-07, |
| "loss": 82.3904, |
| "step": 4325 |
| }, |
| { |
| "epoch": 0.519044622254188, |
| "grad_norm": 103.125, |
| "learning_rate": 5.344345277740775e-07, |
| "loss": 84.4222, |
| "step": 4330 |
| }, |
| { |
| "epoch": 0.5196439809403938, |
| "grad_norm": 103.75, |
| "learning_rate": 5.337684827494339e-07, |
| "loss": 83.1161, |
| "step": 4335 |
| }, |
| { |
| "epoch": 0.5202433396265995, |
| "grad_norm": 108.1875, |
| "learning_rate": 5.331024377247901e-07, |
| "loss": 82.2274, |
| "step": 4340 |
| }, |
| { |
| "epoch": 0.5208426983128053, |
| "grad_norm": 103.375, |
| "learning_rate": 5.324363927001465e-07, |
| "loss": 81.779, |
| "step": 4345 |
| }, |
| { |
| "epoch": 0.521442056999011, |
| "grad_norm": 104.0, |
| "learning_rate": 5.317703476755029e-07, |
| "loss": 83.6576, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.5220414156852168, |
| "grad_norm": 107.0625, |
| "learning_rate": 5.311043026508592e-07, |
| "loss": 84.1208, |
| "step": 4355 |
| }, |
| { |
| "epoch": 0.5226407743714225, |
| "grad_norm": 105.875, |
| "learning_rate": 5.304382576262155e-07, |
| "loss": 83.0397, |
| "step": 4360 |
| }, |
| { |
| "epoch": 0.5232401330576283, |
| "grad_norm": 103.375, |
| "learning_rate": 5.297722126015718e-07, |
| "loss": 83.5394, |
| "step": 4365 |
| }, |
| { |
| "epoch": 0.5238394917438342, |
| "grad_norm": 108.0625, |
| "learning_rate": 5.291061675769282e-07, |
| "loss": 82.6936, |
| "step": 4370 |
| }, |
| { |
| "epoch": 0.5244388504300399, |
| "grad_norm": 105.125, |
| "learning_rate": 5.284401225522845e-07, |
| "loss": 83.234, |
| "step": 4375 |
| }, |
| { |
| "epoch": 0.5250382091162457, |
| "grad_norm": 105.3125, |
| "learning_rate": 5.277740775276408e-07, |
| "loss": 82.665, |
| "step": 4380 |
| }, |
| { |
| "epoch": 0.5256375678024514, |
| "grad_norm": 103.9375, |
| "learning_rate": 5.271080325029972e-07, |
| "loss": 81.9667, |
| "step": 4385 |
| }, |
| { |
| "epoch": 0.5262369264886572, |
| "grad_norm": 102.25, |
| "learning_rate": 5.264419874783536e-07, |
| "loss": 81.5035, |
| "step": 4390 |
| }, |
| { |
| "epoch": 0.5268362851748629, |
| "grad_norm": 104.625, |
| "learning_rate": 5.257759424537098e-07, |
| "loss": 84.3656, |
| "step": 4395 |
| }, |
| { |
| "epoch": 0.5274356438610687, |
| "grad_norm": 104.0, |
| "learning_rate": 5.251098974290662e-07, |
| "loss": 83.848, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.5280350025472744, |
| "grad_norm": 107.4375, |
| "learning_rate": 5.244438524044225e-07, |
| "loss": 82.2172, |
| "step": 4405 |
| }, |
| { |
| "epoch": 0.5286343612334802, |
| "grad_norm": 104.875, |
| "learning_rate": 5.237778073797788e-07, |
| "loss": 81.9961, |
| "step": 4410 |
| }, |
| { |
| "epoch": 0.5292337199196859, |
| "grad_norm": 103.9375, |
| "learning_rate": 5.231117623551352e-07, |
| "loss": 83.5917, |
| "step": 4415 |
| }, |
| { |
| "epoch": 0.5298330786058917, |
| "grad_norm": 106.375, |
| "learning_rate": 5.224457173304915e-07, |
| "loss": 83.7527, |
| "step": 4420 |
| }, |
| { |
| "epoch": 0.5304324372920974, |
| "grad_norm": 101.9375, |
| "learning_rate": 5.217796723058478e-07, |
| "loss": 83.1019, |
| "step": 4425 |
| }, |
| { |
| "epoch": 0.5310317959783032, |
| "grad_norm": 107.6875, |
| "learning_rate": 5.211136272812042e-07, |
| "loss": 82.3363, |
| "step": 4430 |
| }, |
| { |
| "epoch": 0.5316311546645089, |
| "grad_norm": 106.125, |
| "learning_rate": 5.204475822565605e-07, |
| "loss": 83.6656, |
| "step": 4435 |
| }, |
| { |
| "epoch": 0.5322305133507147, |
| "grad_norm": 103.375, |
| "learning_rate": 5.197815372319169e-07, |
| "loss": 84.1844, |
| "step": 4440 |
| }, |
| { |
| "epoch": 0.5328298720369204, |
| "grad_norm": 104.5, |
| "learning_rate": 5.191154922072732e-07, |
| "loss": 82.8866, |
| "step": 4445 |
| }, |
| { |
| "epoch": 0.5334292307231262, |
| "grad_norm": 104.5, |
| "learning_rate": 5.184494471826295e-07, |
| "loss": 82.2072, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.5340285894093321, |
| "grad_norm": 101.6875, |
| "learning_rate": 5.177834021579859e-07, |
| "loss": 83.5113, |
| "step": 4455 |
| }, |
| { |
| "epoch": 0.5346279480955378, |
| "grad_norm": 105.3125, |
| "learning_rate": 5.171173571333421e-07, |
| "loss": 83.4531, |
| "step": 4460 |
| }, |
| { |
| "epoch": 0.5352273067817436, |
| "grad_norm": 104.375, |
| "learning_rate": 5.164513121086985e-07, |
| "loss": 81.8829, |
| "step": 4465 |
| }, |
| { |
| "epoch": 0.5358266654679493, |
| "grad_norm": 104.3125, |
| "learning_rate": 5.157852670840549e-07, |
| "loss": 83.2843, |
| "step": 4470 |
| }, |
| { |
| "epoch": 0.5364260241541551, |
| "grad_norm": 104.25, |
| "learning_rate": 5.151192220594112e-07, |
| "loss": 82.3948, |
| "step": 4475 |
| }, |
| { |
| "epoch": 0.5370253828403608, |
| "grad_norm": 102.1875, |
| "learning_rate": 5.144531770347675e-07, |
| "loss": 82.8727, |
| "step": 4480 |
| }, |
| { |
| "epoch": 0.5376247415265666, |
| "grad_norm": 103.0625, |
| "learning_rate": 5.137871320101239e-07, |
| "loss": 82.7623, |
| "step": 4485 |
| }, |
| { |
| "epoch": 0.5382241002127723, |
| "grad_norm": 102.1875, |
| "learning_rate": 5.131210869854802e-07, |
| "loss": 81.87, |
| "step": 4490 |
| }, |
| { |
| "epoch": 0.5388234588989781, |
| "grad_norm": 103.875, |
| "learning_rate": 5.124550419608365e-07, |
| "loss": 82.4659, |
| "step": 4495 |
| }, |
| { |
| "epoch": 0.5394228175851838, |
| "grad_norm": 106.3125, |
| "learning_rate": 5.117889969361928e-07, |
| "loss": 81.6633, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.5394228175851838, |
| "eval_loss": 2.588843584060669, |
| "eval_runtime": 398.8047, |
| "eval_samples_per_second": 1127.376, |
| "eval_steps_per_second": 35.233, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.5400221762713896, |
| "grad_norm": 101.25, |
| "learning_rate": 5.111229519115492e-07, |
| "loss": 81.4664, |
| "step": 4505 |
| }, |
| { |
| "epoch": 0.5406215349575954, |
| "grad_norm": 104.1875, |
| "learning_rate": 5.104569068869055e-07, |
| "loss": 82.9599, |
| "step": 4510 |
| }, |
| { |
| "epoch": 0.5412208936438011, |
| "grad_norm": 106.6875, |
| "learning_rate": 5.097908618622618e-07, |
| "loss": 82.806, |
| "step": 4515 |
| }, |
| { |
| "epoch": 0.5418202523300069, |
| "grad_norm": 105.0, |
| "learning_rate": 5.091248168376182e-07, |
| "loss": 82.4233, |
| "step": 4520 |
| }, |
| { |
| "epoch": 0.5424196110162126, |
| "grad_norm": 102.8125, |
| "learning_rate": 5.084587718129746e-07, |
| "loss": 84.2593, |
| "step": 4525 |
| }, |
| { |
| "epoch": 0.5430189697024184, |
| "grad_norm": 108.5625, |
| "learning_rate": 5.077927267883308e-07, |
| "loss": 83.7284, |
| "step": 4530 |
| }, |
| { |
| "epoch": 0.5436183283886241, |
| "grad_norm": 105.375, |
| "learning_rate": 5.071266817636872e-07, |
| "loss": 82.6245, |
| "step": 4535 |
| }, |
| { |
| "epoch": 0.54421768707483, |
| "grad_norm": 105.4375, |
| "learning_rate": 5.064606367390435e-07, |
| "loss": 82.7108, |
| "step": 4540 |
| }, |
| { |
| "epoch": 0.5448170457610357, |
| "grad_norm": 104.875, |
| "learning_rate": 5.057945917143998e-07, |
| "loss": 81.2744, |
| "step": 4545 |
| }, |
| { |
| "epoch": 0.5454164044472415, |
| "grad_norm": 104.0625, |
| "learning_rate": 5.051285466897562e-07, |
| "loss": 81.4598, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.5460157631334472, |
| "grad_norm": 105.75, |
| "learning_rate": 5.044625016651125e-07, |
| "loss": 82.4933, |
| "step": 4555 |
| }, |
| { |
| "epoch": 0.546615121819653, |
| "grad_norm": 106.0, |
| "learning_rate": 5.037964566404688e-07, |
| "loss": 83.762, |
| "step": 4560 |
| }, |
| { |
| "epoch": 0.5472144805058587, |
| "grad_norm": 104.0625, |
| "learning_rate": 5.031304116158252e-07, |
| "loss": 81.7877, |
| "step": 4565 |
| }, |
| { |
| "epoch": 0.5478138391920645, |
| "grad_norm": 107.375, |
| "learning_rate": 5.024643665911815e-07, |
| "loss": 83.3971, |
| "step": 4570 |
| }, |
| { |
| "epoch": 0.5484131978782703, |
| "grad_norm": 106.0625, |
| "learning_rate": 5.017983215665379e-07, |
| "loss": 82.3262, |
| "step": 4575 |
| }, |
| { |
| "epoch": 0.549012556564476, |
| "grad_norm": 100.8125, |
| "learning_rate": 5.011322765418941e-07, |
| "loss": 83.2061, |
| "step": 4580 |
| }, |
| { |
| "epoch": 0.5496119152506818, |
| "grad_norm": 105.8125, |
| "learning_rate": 5.004662315172505e-07, |
| "loss": 81.5443, |
| "step": 4585 |
| }, |
| { |
| "epoch": 0.5502112739368875, |
| "grad_norm": 104.0, |
| "learning_rate": 4.998001864926068e-07, |
| "loss": 82.7867, |
| "step": 4590 |
| }, |
| { |
| "epoch": 0.5508106326230933, |
| "grad_norm": 106.0, |
| "learning_rate": 4.991341414679632e-07, |
| "loss": 82.5244, |
| "step": 4595 |
| }, |
| { |
| "epoch": 0.551409991309299, |
| "grad_norm": 106.625, |
| "learning_rate": 4.984680964433195e-07, |
| "loss": 82.5752, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.5520093499955048, |
| "grad_norm": 106.25, |
| "learning_rate": 4.978020514186758e-07, |
| "loss": 83.1504, |
| "step": 4605 |
| }, |
| { |
| "epoch": 0.5526087086817105, |
| "grad_norm": 106.375, |
| "learning_rate": 4.971360063940322e-07, |
| "loss": 81.3867, |
| "step": 4610 |
| }, |
| { |
| "epoch": 0.5532080673679163, |
| "grad_norm": 105.25, |
| "learning_rate": 4.964699613693885e-07, |
| "loss": 83.0015, |
| "step": 4615 |
| }, |
| { |
| "epoch": 0.5538074260541221, |
| "grad_norm": 104.0625, |
| "learning_rate": 4.958039163447448e-07, |
| "loss": 81.9788, |
| "step": 4620 |
| }, |
| { |
| "epoch": 0.5544067847403279, |
| "grad_norm": 101.625, |
| "learning_rate": 4.951378713201012e-07, |
| "loss": 81.9656, |
| "step": 4625 |
| }, |
| { |
| "epoch": 0.5550061434265336, |
| "grad_norm": 104.1875, |
| "learning_rate": 4.944718262954575e-07, |
| "loss": 80.6905, |
| "step": 4630 |
| }, |
| { |
| "epoch": 0.5556055021127394, |
| "grad_norm": 105.125, |
| "learning_rate": 4.93805781270814e-07, |
| "loss": 82.6242, |
| "step": 4635 |
| }, |
| { |
| "epoch": 0.5562048607989452, |
| "grad_norm": 105.125, |
| "learning_rate": 4.931397362461702e-07, |
| "loss": 81.465, |
| "step": 4640 |
| }, |
| { |
| "epoch": 0.5568042194851509, |
| "grad_norm": 103.9375, |
| "learning_rate": 4.924736912215265e-07, |
| "loss": 82.8664, |
| "step": 4645 |
| }, |
| { |
| "epoch": 0.5574035781713567, |
| "grad_norm": 105.25, |
| "learning_rate": 4.918076461968828e-07, |
| "loss": 83.367, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.5580029368575624, |
| "grad_norm": 106.125, |
| "learning_rate": 4.911416011722393e-07, |
| "loss": 81.668, |
| "step": 4655 |
| }, |
| { |
| "epoch": 0.5586022955437682, |
| "grad_norm": 104.875, |
| "learning_rate": 4.904755561475956e-07, |
| "loss": 82.4106, |
| "step": 4660 |
| }, |
| { |
| "epoch": 0.5592016542299739, |
| "grad_norm": 106.25, |
| "learning_rate": 4.898095111229519e-07, |
| "loss": 83.5437, |
| "step": 4665 |
| }, |
| { |
| "epoch": 0.5598010129161797, |
| "grad_norm": 106.8125, |
| "learning_rate": 4.891434660983082e-07, |
| "loss": 81.4972, |
| "step": 4670 |
| }, |
| { |
| "epoch": 0.5604003716023854, |
| "grad_norm": 108.75, |
| "learning_rate": 4.884774210736646e-07, |
| "loss": 81.9168, |
| "step": 4675 |
| }, |
| { |
| "epoch": 0.5609997302885912, |
| "grad_norm": 105.5625, |
| "learning_rate": 4.878113760490209e-07, |
| "loss": 83.4338, |
| "step": 4680 |
| }, |
| { |
| "epoch": 0.5615990889747969, |
| "grad_norm": 99.9375, |
| "learning_rate": 4.871453310243773e-07, |
| "loss": 81.1651, |
| "step": 4685 |
| }, |
| { |
| "epoch": 0.5621984476610027, |
| "grad_norm": 106.6875, |
| "learning_rate": 4.864792859997336e-07, |
| "loss": 82.8396, |
| "step": 4690 |
| }, |
| { |
| "epoch": 0.5627978063472084, |
| "grad_norm": 101.0, |
| "learning_rate": 4.8581324097509e-07, |
| "loss": 83.3178, |
| "step": 4695 |
| }, |
| { |
| "epoch": 0.5633971650334142, |
| "grad_norm": 105.5, |
| "learning_rate": 4.851471959504463e-07, |
| "loss": 82.1475, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.5639965237196201, |
| "grad_norm": 104.75, |
| "learning_rate": 4.844811509258026e-07, |
| "loss": 81.1286, |
| "step": 4705 |
| }, |
| { |
| "epoch": 0.5645958824058258, |
| "grad_norm": 104.6875, |
| "learning_rate": 4.838151059011589e-07, |
| "loss": 82.0347, |
| "step": 4710 |
| }, |
| { |
| "epoch": 0.5651952410920316, |
| "grad_norm": 103.875, |
| "learning_rate": 4.831490608765153e-07, |
| "loss": 82.6172, |
| "step": 4715 |
| }, |
| { |
| "epoch": 0.5657945997782373, |
| "grad_norm": 104.5, |
| "learning_rate": 4.824830158518716e-07, |
| "loss": 81.2462, |
| "step": 4720 |
| }, |
| { |
| "epoch": 0.5663939584644431, |
| "grad_norm": 104.0, |
| "learning_rate": 4.818169708272279e-07, |
| "loss": 81.1761, |
| "step": 4725 |
| }, |
| { |
| "epoch": 0.5669933171506488, |
| "grad_norm": 105.3125, |
| "learning_rate": 4.811509258025842e-07, |
| "loss": 83.4811, |
| "step": 4730 |
| }, |
| { |
| "epoch": 0.5675926758368546, |
| "grad_norm": 104.1875, |
| "learning_rate": 4.804848807779406e-07, |
| "loss": 82.4757, |
| "step": 4735 |
| }, |
| { |
| "epoch": 0.5681920345230603, |
| "grad_norm": 107.3125, |
| "learning_rate": 4.798188357532969e-07, |
| "loss": 80.6124, |
| "step": 4740 |
| }, |
| { |
| "epoch": 0.5687913932092661, |
| "grad_norm": 104.75, |
| "learning_rate": 4.791527907286533e-07, |
| "loss": 82.6368, |
| "step": 4745 |
| }, |
| { |
| "epoch": 0.5693907518954718, |
| "grad_norm": 104.75, |
| "learning_rate": 4.784867457040096e-07, |
| "loss": 81.416, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.5699901105816776, |
| "grad_norm": 104.125, |
| "learning_rate": 4.778207006793659e-07, |
| "loss": 81.6732, |
| "step": 4755 |
| }, |
| { |
| "epoch": 0.5705894692678833, |
| "grad_norm": 103.1875, |
| "learning_rate": 4.771546556547223e-07, |
| "loss": 80.9805, |
| "step": 4760 |
| }, |
| { |
| "epoch": 0.5711888279540891, |
| "grad_norm": 102.8125, |
| "learning_rate": 4.764886106300786e-07, |
| "loss": 80.7926, |
| "step": 4765 |
| }, |
| { |
| "epoch": 0.5717881866402948, |
| "grad_norm": 106.0625, |
| "learning_rate": 4.758225656054349e-07, |
| "loss": 82.4188, |
| "step": 4770 |
| }, |
| { |
| "epoch": 0.5723875453265006, |
| "grad_norm": 104.6875, |
| "learning_rate": 4.751565205807913e-07, |
| "loss": 81.5341, |
| "step": 4775 |
| }, |
| { |
| "epoch": 0.5729869040127064, |
| "grad_norm": 106.6875, |
| "learning_rate": 4.744904755561476e-07, |
| "loss": 80.7328, |
| "step": 4780 |
| }, |
| { |
| "epoch": 0.5735862626989122, |
| "grad_norm": 105.5625, |
| "learning_rate": 4.738244305315039e-07, |
| "loss": 81.8097, |
| "step": 4785 |
| }, |
| { |
| "epoch": 0.574185621385118, |
| "grad_norm": 105.875, |
| "learning_rate": 4.731583855068603e-07, |
| "loss": 81.057, |
| "step": 4790 |
| }, |
| { |
| "epoch": 0.5747849800713237, |
| "grad_norm": 104.5625, |
| "learning_rate": 4.724923404822166e-07, |
| "loss": 82.8115, |
| "step": 4795 |
| }, |
| { |
| "epoch": 0.5753843387575295, |
| "grad_norm": 105.9375, |
| "learning_rate": 4.7182629545757293e-07, |
| "loss": 81.8745, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.5759836974437352, |
| "grad_norm": 107.9375, |
| "learning_rate": 4.7116025043292923e-07, |
| "loss": 81.5439, |
| "step": 4805 |
| }, |
| { |
| "epoch": 0.576583056129941, |
| "grad_norm": 104.0625, |
| "learning_rate": 4.704942054082856e-07, |
| "loss": 82.1254, |
| "step": 4810 |
| }, |
| { |
| "epoch": 0.5771824148161467, |
| "grad_norm": 103.6875, |
| "learning_rate": 4.6982816038364194e-07, |
| "loss": 81.1297, |
| "step": 4815 |
| }, |
| { |
| "epoch": 0.5777817735023525, |
| "grad_norm": 104.1875, |
| "learning_rate": 4.6916211535899823e-07, |
| "loss": 82.4339, |
| "step": 4820 |
| }, |
| { |
| "epoch": 0.5783811321885582, |
| "grad_norm": 104.125, |
| "learning_rate": 4.684960703343546e-07, |
| "loss": 81.5759, |
| "step": 4825 |
| }, |
| { |
| "epoch": 0.578980490874764, |
| "grad_norm": 106.5, |
| "learning_rate": 4.6783002530971094e-07, |
| "loss": 81.4303, |
| "step": 4830 |
| }, |
| { |
| "epoch": 0.5795798495609698, |
| "grad_norm": 102.75, |
| "learning_rate": 4.6716398028506724e-07, |
| "loss": 80.9401, |
| "step": 4835 |
| }, |
| { |
| "epoch": 0.5801792082471755, |
| "grad_norm": 106.0, |
| "learning_rate": 4.664979352604236e-07, |
| "loss": 82.6096, |
| "step": 4840 |
| }, |
| { |
| "epoch": 0.5807785669333813, |
| "grad_norm": 103.5, |
| "learning_rate": 4.658318902357799e-07, |
| "loss": 80.9871, |
| "step": 4845 |
| }, |
| { |
| "epoch": 0.581377925619587, |
| "grad_norm": 108.1875, |
| "learning_rate": 4.651658452111363e-07, |
| "loss": 82.111, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.5819772843057928, |
| "grad_norm": 103.375, |
| "learning_rate": 4.644998001864926e-07, |
| "loss": 81.8643, |
| "step": 4855 |
| }, |
| { |
| "epoch": 0.5825766429919985, |
| "grad_norm": 105.5, |
| "learning_rate": 4.638337551618489e-07, |
| "loss": 81.4741, |
| "step": 4860 |
| }, |
| { |
| "epoch": 0.5831760016782043, |
| "grad_norm": 107.25, |
| "learning_rate": 4.6316771013720524e-07, |
| "loss": 82.0387, |
| "step": 4865 |
| }, |
| { |
| "epoch": 0.5837753603644101, |
| "grad_norm": 103.9375, |
| "learning_rate": 4.625016651125616e-07, |
| "loss": 81.4705, |
| "step": 4870 |
| }, |
| { |
| "epoch": 0.5843747190506159, |
| "grad_norm": 105.125, |
| "learning_rate": 4.6183562008791795e-07, |
| "loss": 81.175, |
| "step": 4875 |
| }, |
| { |
| "epoch": 0.5849740777368216, |
| "grad_norm": 103.0625, |
| "learning_rate": 4.6116957506327425e-07, |
| "loss": 81.0439, |
| "step": 4880 |
| }, |
| { |
| "epoch": 0.5855734364230274, |
| "grad_norm": 107.4375, |
| "learning_rate": 4.6050353003863055e-07, |
| "loss": 80.9275, |
| "step": 4885 |
| }, |
| { |
| "epoch": 0.5861727951092331, |
| "grad_norm": 102.9375, |
| "learning_rate": 4.5983748501398695e-07, |
| "loss": 82.4049, |
| "step": 4890 |
| }, |
| { |
| "epoch": 0.5867721537954389, |
| "grad_norm": 106.125, |
| "learning_rate": 4.5917143998934325e-07, |
| "loss": 82.7499, |
| "step": 4895 |
| }, |
| { |
| "epoch": 0.5873715124816447, |
| "grad_norm": 104.3125, |
| "learning_rate": 4.585053949646996e-07, |
| "loss": 82.2027, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.5879708711678504, |
| "grad_norm": 106.5, |
| "learning_rate": 4.578393499400559e-07, |
| "loss": 82.3243, |
| "step": 4905 |
| }, |
| { |
| "epoch": 0.5885702298540562, |
| "grad_norm": 106.0625, |
| "learning_rate": 4.571733049154123e-07, |
| "loss": 82.9383, |
| "step": 4910 |
| }, |
| { |
| "epoch": 0.5891695885402619, |
| "grad_norm": 107.3125, |
| "learning_rate": 4.565072598907686e-07, |
| "loss": 81.5135, |
| "step": 4915 |
| }, |
| { |
| "epoch": 0.5897689472264677, |
| "grad_norm": 103.8125, |
| "learning_rate": 4.558412148661249e-07, |
| "loss": 82.6818, |
| "step": 4920 |
| }, |
| { |
| "epoch": 0.5903683059126734, |
| "grad_norm": 104.6875, |
| "learning_rate": 4.5517516984148126e-07, |
| "loss": 81.2648, |
| "step": 4925 |
| }, |
| { |
| "epoch": 0.5909676645988792, |
| "grad_norm": 101.875, |
| "learning_rate": 4.545091248168376e-07, |
| "loss": 82.1621, |
| "step": 4930 |
| }, |
| { |
| "epoch": 0.5915670232850849, |
| "grad_norm": 105.9375, |
| "learning_rate": 4.5384307979219396e-07, |
| "loss": 82.1844, |
| "step": 4935 |
| }, |
| { |
| "epoch": 0.5921663819712907, |
| "grad_norm": 104.4375, |
| "learning_rate": 4.5317703476755026e-07, |
| "loss": 81.5659, |
| "step": 4940 |
| }, |
| { |
| "epoch": 0.5927657406574964, |
| "grad_norm": 106.25, |
| "learning_rate": 4.5251098974290656e-07, |
| "loss": 82.0492, |
| "step": 4945 |
| }, |
| { |
| "epoch": 0.5933650993437022, |
| "grad_norm": 109.0625, |
| "learning_rate": 4.5184494471826296e-07, |
| "loss": 81.1946, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.593964458029908, |
| "grad_norm": 105.875, |
| "learning_rate": 4.5117889969361926e-07, |
| "loss": 82.4996, |
| "step": 4955 |
| }, |
| { |
| "epoch": 0.5945638167161138, |
| "grad_norm": 105.125, |
| "learning_rate": 4.505128546689756e-07, |
| "loss": 81.4744, |
| "step": 4960 |
| }, |
| { |
| "epoch": 0.5951631754023196, |
| "grad_norm": 106.3125, |
| "learning_rate": 4.498468096443319e-07, |
| "loss": 80.7131, |
| "step": 4965 |
| }, |
| { |
| "epoch": 0.5957625340885253, |
| "grad_norm": 111.3125, |
| "learning_rate": 4.4918076461968827e-07, |
| "loss": 82.425, |
| "step": 4970 |
| }, |
| { |
| "epoch": 0.5963618927747311, |
| "grad_norm": 108.0, |
| "learning_rate": 4.485147195950446e-07, |
| "loss": 79.8071, |
| "step": 4975 |
| }, |
| { |
| "epoch": 0.5969612514609368, |
| "grad_norm": 103.0625, |
| "learning_rate": 4.478486745704009e-07, |
| "loss": 82.7906, |
| "step": 4980 |
| }, |
| { |
| "epoch": 0.5975606101471426, |
| "grad_norm": 105.1875, |
| "learning_rate": 4.4718262954575727e-07, |
| "loss": 81.2079, |
| "step": 4985 |
| }, |
| { |
| "epoch": 0.5981599688333483, |
| "grad_norm": 107.25, |
| "learning_rate": 4.465165845211136e-07, |
| "loss": 80.6633, |
| "step": 4990 |
| }, |
| { |
| "epoch": 0.5987593275195541, |
| "grad_norm": 105.0, |
| "learning_rate": 4.458505394964699e-07, |
| "loss": 82.2591, |
| "step": 4995 |
| }, |
| { |
| "epoch": 0.5993586862057598, |
| "grad_norm": 101.875, |
| "learning_rate": 4.4518449447182627e-07, |
| "loss": 81.2498, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.5993586862057598, |
| "eval_loss": 2.542724609375, |
| "eval_runtime": 401.2295, |
| "eval_samples_per_second": 1120.563, |
| "eval_steps_per_second": 35.02, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.5999580448919656, |
| "grad_norm": 107.0625, |
| "learning_rate": 4.4451844944718257e-07, |
| "loss": 81.491, |
| "step": 5005 |
| }, |
| { |
| "epoch": 0.6005574035781713, |
| "grad_norm": 106.5625, |
| "learning_rate": 4.43852404422539e-07, |
| "loss": 80.9435, |
| "step": 5010 |
| }, |
| { |
| "epoch": 0.6011567622643771, |
| "grad_norm": 106.0625, |
| "learning_rate": 4.431863593978953e-07, |
| "loss": 80.1556, |
| "step": 5015 |
| }, |
| { |
| "epoch": 0.6017561209505828, |
| "grad_norm": 104.5625, |
| "learning_rate": 4.4252031437325163e-07, |
| "loss": 81.2597, |
| "step": 5020 |
| }, |
| { |
| "epoch": 0.6023554796367886, |
| "grad_norm": 103.0625, |
| "learning_rate": 4.4185426934860793e-07, |
| "loss": 80.4449, |
| "step": 5025 |
| }, |
| { |
| "epoch": 0.6029548383229943, |
| "grad_norm": 105.4375, |
| "learning_rate": 4.411882243239643e-07, |
| "loss": 79.8876, |
| "step": 5030 |
| }, |
| { |
| "epoch": 0.6035541970092002, |
| "grad_norm": 102.8125, |
| "learning_rate": 4.4052217929932063e-07, |
| "loss": 81.1896, |
| "step": 5035 |
| }, |
| { |
| "epoch": 0.604153555695406, |
| "grad_norm": 106.5, |
| "learning_rate": 4.3985613427467693e-07, |
| "loss": 80.7079, |
| "step": 5040 |
| }, |
| { |
| "epoch": 0.6047529143816117, |
| "grad_norm": 106.3125, |
| "learning_rate": 4.391900892500333e-07, |
| "loss": 80.8943, |
| "step": 5045 |
| }, |
| { |
| "epoch": 0.6053522730678175, |
| "grad_norm": 103.9375, |
| "learning_rate": 4.3852404422538963e-07, |
| "loss": 80.9722, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.6059516317540232, |
| "grad_norm": 103.0, |
| "learning_rate": 4.3785799920074593e-07, |
| "loss": 80.274, |
| "step": 5055 |
| }, |
| { |
| "epoch": 0.606550990440229, |
| "grad_norm": 103.0, |
| "learning_rate": 4.371919541761023e-07, |
| "loss": 81.1316, |
| "step": 5060 |
| }, |
| { |
| "epoch": 0.6071503491264347, |
| "grad_norm": 107.1875, |
| "learning_rate": 4.365259091514586e-07, |
| "loss": 81.8072, |
| "step": 5065 |
| }, |
| { |
| "epoch": 0.6077497078126405, |
| "grad_norm": 104.5, |
| "learning_rate": 4.35859864126815e-07, |
| "loss": 81.996, |
| "step": 5070 |
| }, |
| { |
| "epoch": 0.6083490664988462, |
| "grad_norm": 107.5625, |
| "learning_rate": 4.351938191021713e-07, |
| "loss": 81.1877, |
| "step": 5075 |
| }, |
| { |
| "epoch": 0.608948425185052, |
| "grad_norm": 108.8125, |
| "learning_rate": 4.345277740775276e-07, |
| "loss": 80.1332, |
| "step": 5080 |
| }, |
| { |
| "epoch": 0.6095477838712577, |
| "grad_norm": 108.9375, |
| "learning_rate": 4.3386172905288394e-07, |
| "loss": 82.2149, |
| "step": 5085 |
| }, |
| { |
| "epoch": 0.6101471425574635, |
| "grad_norm": 100.5625, |
| "learning_rate": 4.331956840282403e-07, |
| "loss": 80.578, |
| "step": 5090 |
| }, |
| { |
| "epoch": 0.6107465012436692, |
| "grad_norm": 106.6875, |
| "learning_rate": 4.3252963900359664e-07, |
| "loss": 79.6262, |
| "step": 5095 |
| }, |
| { |
| "epoch": 0.611345859929875, |
| "grad_norm": 105.3125, |
| "learning_rate": 4.3186359397895294e-07, |
| "loss": 81.7019, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.6119452186160808, |
| "grad_norm": 101.375, |
| "learning_rate": 4.3119754895430924e-07, |
| "loss": 80.9652, |
| "step": 5105 |
| }, |
| { |
| "epoch": 0.6125445773022865, |
| "grad_norm": 104.125, |
| "learning_rate": 4.3053150392966565e-07, |
| "loss": 79.8046, |
| "step": 5110 |
| }, |
| { |
| "epoch": 0.6131439359884923, |
| "grad_norm": 107.1875, |
| "learning_rate": 4.2986545890502195e-07, |
| "loss": 80.7197, |
| "step": 5115 |
| }, |
| { |
| "epoch": 0.6137432946746981, |
| "grad_norm": 106.25, |
| "learning_rate": 4.291994138803783e-07, |
| "loss": 81.8292, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.6143426533609039, |
| "grad_norm": 105.0, |
| "learning_rate": 4.2853336885573465e-07, |
| "loss": 80.7553, |
| "step": 5125 |
| }, |
| { |
| "epoch": 0.6149420120471096, |
| "grad_norm": 106.3125, |
| "learning_rate": 4.27867323831091e-07, |
| "loss": 82.0579, |
| "step": 5130 |
| }, |
| { |
| "epoch": 0.6155413707333154, |
| "grad_norm": 104.125, |
| "learning_rate": 4.272012788064473e-07, |
| "loss": 81.464, |
| "step": 5135 |
| }, |
| { |
| "epoch": 0.6161407294195211, |
| "grad_norm": 104.9375, |
| "learning_rate": 4.265352337818036e-07, |
| "loss": 81.48, |
| "step": 5140 |
| }, |
| { |
| "epoch": 0.6167400881057269, |
| "grad_norm": 101.5, |
| "learning_rate": 4.2586918875716e-07, |
| "loss": 80.81, |
| "step": 5145 |
| }, |
| { |
| "epoch": 0.6173394467919326, |
| "grad_norm": 108.5625, |
| "learning_rate": 4.252031437325163e-07, |
| "loss": 82.4736, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.6179388054781384, |
| "grad_norm": 105.375, |
| "learning_rate": 4.2453709870787266e-07, |
| "loss": 80.7478, |
| "step": 5155 |
| }, |
| { |
| "epoch": 0.6185381641643442, |
| "grad_norm": 105.0625, |
| "learning_rate": 4.2387105368322896e-07, |
| "loss": 81.2949, |
| "step": 5160 |
| }, |
| { |
| "epoch": 0.6191375228505499, |
| "grad_norm": 102.5, |
| "learning_rate": 4.232050086585853e-07, |
| "loss": 82.3897, |
| "step": 5165 |
| }, |
| { |
| "epoch": 0.6197368815367557, |
| "grad_norm": 106.9375, |
| "learning_rate": 4.2253896363394166e-07, |
| "loss": 80.5457, |
| "step": 5170 |
| }, |
| { |
| "epoch": 0.6203362402229614, |
| "grad_norm": 106.375, |
| "learning_rate": 4.2187291860929796e-07, |
| "loss": 81.6608, |
| "step": 5175 |
| }, |
| { |
| "epoch": 0.6209355989091672, |
| "grad_norm": 108.8125, |
| "learning_rate": 4.212068735846543e-07, |
| "loss": 80.4467, |
| "step": 5180 |
| }, |
| { |
| "epoch": 0.6215349575953729, |
| "grad_norm": 105.4375, |
| "learning_rate": 4.2054082856001066e-07, |
| "loss": 81.4107, |
| "step": 5185 |
| }, |
| { |
| "epoch": 0.6221343162815787, |
| "grad_norm": 105.1875, |
| "learning_rate": 4.1987478353536696e-07, |
| "loss": 79.603, |
| "step": 5190 |
| }, |
| { |
| "epoch": 0.6227336749677844, |
| "grad_norm": 107.875, |
| "learning_rate": 4.192087385107233e-07, |
| "loss": 79.1541, |
| "step": 5195 |
| }, |
| { |
| "epoch": 0.6233330336539902, |
| "grad_norm": 108.625, |
| "learning_rate": 4.185426934860796e-07, |
| "loss": 81.4973, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.623932392340196, |
| "grad_norm": 104.6875, |
| "learning_rate": 4.17876648461436e-07, |
| "loss": 81.097, |
| "step": 5205 |
| }, |
| { |
| "epoch": 0.6245317510264018, |
| "grad_norm": 106.9375, |
| "learning_rate": 4.172106034367923e-07, |
| "loss": 80.2684, |
| "step": 5210 |
| }, |
| { |
| "epoch": 0.6251311097126075, |
| "grad_norm": 105.625, |
| "learning_rate": 4.165445584121486e-07, |
| "loss": 80.8177, |
| "step": 5215 |
| }, |
| { |
| "epoch": 0.6257304683988133, |
| "grad_norm": 103.25, |
| "learning_rate": 4.1587851338750497e-07, |
| "loss": 81.7705, |
| "step": 5220 |
| }, |
| { |
| "epoch": 0.626329827085019, |
| "grad_norm": 107.0, |
| "learning_rate": 4.152124683628613e-07, |
| "loss": 81.2222, |
| "step": 5225 |
| }, |
| { |
| "epoch": 0.6269291857712248, |
| "grad_norm": 106.625, |
| "learning_rate": 4.1454642333821767e-07, |
| "loss": 81.7065, |
| "step": 5230 |
| }, |
| { |
| "epoch": 0.6275285444574306, |
| "grad_norm": 105.0625, |
| "learning_rate": 4.1388037831357397e-07, |
| "loss": 80.9833, |
| "step": 5235 |
| }, |
| { |
| "epoch": 0.6281279031436363, |
| "grad_norm": 107.625, |
| "learning_rate": 4.1321433328893027e-07, |
| "loss": 80.2564, |
| "step": 5240 |
| }, |
| { |
| "epoch": 0.6287272618298421, |
| "grad_norm": 104.875, |
| "learning_rate": 4.125482882642867e-07, |
| "loss": 80.1978, |
| "step": 5245 |
| }, |
| { |
| "epoch": 0.6293266205160478, |
| "grad_norm": 107.4375, |
| "learning_rate": 4.11882243239643e-07, |
| "loss": 80.0157, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.6299259792022536, |
| "grad_norm": 106.5625, |
| "learning_rate": 4.1121619821499933e-07, |
| "loss": 79.8329, |
| "step": 5255 |
| }, |
| { |
| "epoch": 0.6305253378884593, |
| "grad_norm": 104.9375, |
| "learning_rate": 4.105501531903556e-07, |
| "loss": 80.725, |
| "step": 5260 |
| }, |
| { |
| "epoch": 0.6311246965746651, |
| "grad_norm": 104.6875, |
| "learning_rate": 4.0988410816571203e-07, |
| "loss": 81.1629, |
| "step": 5265 |
| }, |
| { |
| "epoch": 0.6317240552608708, |
| "grad_norm": 106.875, |
| "learning_rate": 4.0921806314106833e-07, |
| "loss": 79.0839, |
| "step": 5270 |
| }, |
| { |
| "epoch": 0.6323234139470766, |
| "grad_norm": 102.375, |
| "learning_rate": 4.0855201811642463e-07, |
| "loss": 81.4344, |
| "step": 5275 |
| }, |
| { |
| "epoch": 0.6329227726332823, |
| "grad_norm": 103.125, |
| "learning_rate": 4.07885973091781e-07, |
| "loss": 79.7944, |
| "step": 5280 |
| }, |
| { |
| "epoch": 0.6335221313194882, |
| "grad_norm": 107.6875, |
| "learning_rate": 4.0721992806713733e-07, |
| "loss": 80.6244, |
| "step": 5285 |
| }, |
| { |
| "epoch": 0.634121490005694, |
| "grad_norm": 105.1875, |
| "learning_rate": 4.065538830424937e-07, |
| "loss": 82.5008, |
| "step": 5290 |
| }, |
| { |
| "epoch": 0.6347208486918997, |
| "grad_norm": 102.625, |
| "learning_rate": 4.0588783801785e-07, |
| "loss": 80.2262, |
| "step": 5295 |
| }, |
| { |
| "epoch": 0.6353202073781055, |
| "grad_norm": 103.875, |
| "learning_rate": 4.052217929932063e-07, |
| "loss": 80.5795, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.6359195660643112, |
| "grad_norm": 106.5, |
| "learning_rate": 4.045557479685627e-07, |
| "loss": 80.6724, |
| "step": 5305 |
| }, |
| { |
| "epoch": 0.636518924750517, |
| "grad_norm": 108.3125, |
| "learning_rate": 4.03889702943919e-07, |
| "loss": 80.1853, |
| "step": 5310 |
| }, |
| { |
| "epoch": 0.6371182834367227, |
| "grad_norm": 106.6875, |
| "learning_rate": 4.0322365791927534e-07, |
| "loss": 81.406, |
| "step": 5315 |
| }, |
| { |
| "epoch": 0.6377176421229285, |
| "grad_norm": 105.25, |
| "learning_rate": 4.0255761289463164e-07, |
| "loss": 79.3839, |
| "step": 5320 |
| }, |
| { |
| "epoch": 0.6383170008091342, |
| "grad_norm": 105.0, |
| "learning_rate": 4.01891567869988e-07, |
| "loss": 81.6353, |
| "step": 5325 |
| }, |
| { |
| "epoch": 0.63891635949534, |
| "grad_norm": 104.5, |
| "learning_rate": 4.0122552284534434e-07, |
| "loss": 79.2809, |
| "step": 5330 |
| }, |
| { |
| "epoch": 0.6395157181815457, |
| "grad_norm": 103.4375, |
| "learning_rate": 4.0055947782070064e-07, |
| "loss": 80.5541, |
| "step": 5335 |
| }, |
| { |
| "epoch": 0.6401150768677515, |
| "grad_norm": 107.3125, |
| "learning_rate": 3.99893432796057e-07, |
| "loss": 80.4581, |
| "step": 5340 |
| }, |
| { |
| "epoch": 0.6407144355539572, |
| "grad_norm": 106.5625, |
| "learning_rate": 3.9922738777141335e-07, |
| "loss": 79.9925, |
| "step": 5345 |
| }, |
| { |
| "epoch": 0.641313794240163, |
| "grad_norm": 106.1875, |
| "learning_rate": 3.9856134274676965e-07, |
| "loss": 80.1636, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.6419131529263687, |
| "grad_norm": 105.3125, |
| "learning_rate": 3.97895297722126e-07, |
| "loss": 80.483, |
| "step": 5355 |
| }, |
| { |
| "epoch": 0.6425125116125745, |
| "grad_norm": 107.8125, |
| "learning_rate": 3.972292526974823e-07, |
| "loss": 80.7774, |
| "step": 5360 |
| }, |
| { |
| "epoch": 0.6431118702987803, |
| "grad_norm": 107.375, |
| "learning_rate": 3.965632076728387e-07, |
| "loss": 81.4123, |
| "step": 5365 |
| }, |
| { |
| "epoch": 0.6437112289849861, |
| "grad_norm": 104.875, |
| "learning_rate": 3.95897162648195e-07, |
| "loss": 80.1176, |
| "step": 5370 |
| }, |
| { |
| "epoch": 0.6443105876711919, |
| "grad_norm": 105.8125, |
| "learning_rate": 3.952311176235513e-07, |
| "loss": 80.2847, |
| "step": 5375 |
| }, |
| { |
| "epoch": 0.6449099463573976, |
| "grad_norm": 105.0625, |
| "learning_rate": 3.9456507259890765e-07, |
| "loss": 80.2917, |
| "step": 5380 |
| }, |
| { |
| "epoch": 0.6455093050436034, |
| "grad_norm": 105.125, |
| "learning_rate": 3.93899027574264e-07, |
| "loss": 80.3692, |
| "step": 5385 |
| }, |
| { |
| "epoch": 0.6461086637298091, |
| "grad_norm": 105.8125, |
| "learning_rate": 3.9323298254962036e-07, |
| "loss": 79.7248, |
| "step": 5390 |
| }, |
| { |
| "epoch": 0.6467080224160149, |
| "grad_norm": 109.75, |
| "learning_rate": 3.9256693752497665e-07, |
| "loss": 79.5496, |
| "step": 5395 |
| }, |
| { |
| "epoch": 0.6473073811022206, |
| "grad_norm": 105.5625, |
| "learning_rate": 3.91900892500333e-07, |
| "loss": 79.3702, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.6479067397884264, |
| "grad_norm": 104.75, |
| "learning_rate": 3.9123484747568936e-07, |
| "loss": 81.0996, |
| "step": 5405 |
| }, |
| { |
| "epoch": 0.6485060984746321, |
| "grad_norm": 104.75, |
| "learning_rate": 3.9056880245104566e-07, |
| "loss": 79.2324, |
| "step": 5410 |
| }, |
| { |
| "epoch": 0.6491054571608379, |
| "grad_norm": 108.1875, |
| "learning_rate": 3.89902757426402e-07, |
| "loss": 80.0891, |
| "step": 5415 |
| }, |
| { |
| "epoch": 0.6497048158470436, |
| "grad_norm": 107.375, |
| "learning_rate": 3.892367124017583e-07, |
| "loss": 79.303, |
| "step": 5420 |
| }, |
| { |
| "epoch": 0.6503041745332494, |
| "grad_norm": 107.1875, |
| "learning_rate": 3.885706673771147e-07, |
| "loss": 80.5127, |
| "step": 5425 |
| }, |
| { |
| "epoch": 0.6509035332194552, |
| "grad_norm": 102.9375, |
| "learning_rate": 3.87904622352471e-07, |
| "loss": 80.0789, |
| "step": 5430 |
| }, |
| { |
| "epoch": 0.6515028919056609, |
| "grad_norm": 104.625, |
| "learning_rate": 3.872385773278273e-07, |
| "loss": 80.4981, |
| "step": 5435 |
| }, |
| { |
| "epoch": 0.6521022505918667, |
| "grad_norm": 105.6875, |
| "learning_rate": 3.8657253230318366e-07, |
| "loss": 79.4556, |
| "step": 5440 |
| }, |
| { |
| "epoch": 0.6527016092780724, |
| "grad_norm": 104.625, |
| "learning_rate": 3.8590648727854e-07, |
| "loss": 81.356, |
| "step": 5445 |
| }, |
| { |
| "epoch": 0.6533009679642782, |
| "grad_norm": 105.125, |
| "learning_rate": 3.8524044225389637e-07, |
| "loss": 80.1964, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.653900326650484, |
| "grad_norm": 105.6875, |
| "learning_rate": 3.8457439722925267e-07, |
| "loss": 80.0771, |
| "step": 5455 |
| }, |
| { |
| "epoch": 0.6544996853366898, |
| "grad_norm": 106.0625, |
| "learning_rate": 3.8390835220460897e-07, |
| "loss": 81.1686, |
| "step": 5460 |
| }, |
| { |
| "epoch": 0.6550990440228955, |
| "grad_norm": 107.1875, |
| "learning_rate": 3.8324230717996537e-07, |
| "loss": 79.9953, |
| "step": 5465 |
| }, |
| { |
| "epoch": 0.6556984027091013, |
| "grad_norm": 109.6875, |
| "learning_rate": 3.8257626215532167e-07, |
| "loss": 80.6024, |
| "step": 5470 |
| }, |
| { |
| "epoch": 0.656297761395307, |
| "grad_norm": 103.625, |
| "learning_rate": 3.81910217130678e-07, |
| "loss": 81.1354, |
| "step": 5475 |
| }, |
| { |
| "epoch": 0.6568971200815128, |
| "grad_norm": 106.8125, |
| "learning_rate": 3.812441721060343e-07, |
| "loss": 79.0769, |
| "step": 5480 |
| }, |
| { |
| "epoch": 0.6574964787677186, |
| "grad_norm": 104.75, |
| "learning_rate": 3.805781270813907e-07, |
| "loss": 80.8966, |
| "step": 5485 |
| }, |
| { |
| "epoch": 0.6580958374539243, |
| "grad_norm": 105.25, |
| "learning_rate": 3.79912082056747e-07, |
| "loss": 78.8852, |
| "step": 5490 |
| }, |
| { |
| "epoch": 0.6586951961401301, |
| "grad_norm": 107.25, |
| "learning_rate": 3.792460370321033e-07, |
| "loss": 80.7936, |
| "step": 5495 |
| }, |
| { |
| "epoch": 0.6592945548263358, |
| "grad_norm": 106.6875, |
| "learning_rate": 3.7857999200745973e-07, |
| "loss": 79.082, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.6592945548263358, |
| "eval_loss": 2.5001039505004883, |
| "eval_runtime": 405.6723, |
| "eval_samples_per_second": 1108.291, |
| "eval_steps_per_second": 34.636, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.6598939135125416, |
| "grad_norm": 104.375, |
| "learning_rate": 3.7791394698281603e-07, |
| "loss": 79.8162, |
| "step": 5505 |
| }, |
| { |
| "epoch": 0.6604932721987473, |
| "grad_norm": 108.25, |
| "learning_rate": 3.7724790195817233e-07, |
| "loss": 80.6216, |
| "step": 5510 |
| }, |
| { |
| "epoch": 0.6610926308849531, |
| "grad_norm": 106.25, |
| "learning_rate": 3.765818569335287e-07, |
| "loss": 80.1501, |
| "step": 5515 |
| }, |
| { |
| "epoch": 0.6616919895711588, |
| "grad_norm": 105.9375, |
| "learning_rate": 3.7591581190888503e-07, |
| "loss": 80.254, |
| "step": 5520 |
| }, |
| { |
| "epoch": 0.6622913482573646, |
| "grad_norm": 105.9375, |
| "learning_rate": 3.752497668842414e-07, |
| "loss": 80.2296, |
| "step": 5525 |
| }, |
| { |
| "epoch": 0.6628907069435703, |
| "grad_norm": 104.75, |
| "learning_rate": 3.745837218595977e-07, |
| "loss": 79.7098, |
| "step": 5530 |
| }, |
| { |
| "epoch": 0.6634900656297762, |
| "grad_norm": 105.9375, |
| "learning_rate": 3.7391767683495404e-07, |
| "loss": 80.7595, |
| "step": 5535 |
| }, |
| { |
| "epoch": 0.664089424315982, |
| "grad_norm": 106.0, |
| "learning_rate": 3.732516318103104e-07, |
| "loss": 79.3832, |
| "step": 5540 |
| }, |
| { |
| "epoch": 0.6646887830021877, |
| "grad_norm": 104.0, |
| "learning_rate": 3.725855867856667e-07, |
| "loss": 80.1191, |
| "step": 5545 |
| }, |
| { |
| "epoch": 0.6652881416883935, |
| "grad_norm": 104.0625, |
| "learning_rate": 3.7191954176102304e-07, |
| "loss": 79.0603, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.6658875003745992, |
| "grad_norm": 109.0, |
| "learning_rate": 3.7125349673637934e-07, |
| "loss": 79.1256, |
| "step": 5555 |
| }, |
| { |
| "epoch": 0.666486859060805, |
| "grad_norm": 104.5625, |
| "learning_rate": 3.7058745171173574e-07, |
| "loss": 79.9635, |
| "step": 5560 |
| }, |
| { |
| "epoch": 0.6670862177470107, |
| "grad_norm": 105.4375, |
| "learning_rate": 3.6992140668709204e-07, |
| "loss": 80.7606, |
| "step": 5565 |
| }, |
| { |
| "epoch": 0.6676855764332165, |
| "grad_norm": 103.375, |
| "learning_rate": 3.6925536166244834e-07, |
| "loss": 78.173, |
| "step": 5570 |
| }, |
| { |
| "epoch": 0.6682849351194222, |
| "grad_norm": 103.125, |
| "learning_rate": 3.685893166378047e-07, |
| "loss": 79.259, |
| "step": 5575 |
| }, |
| { |
| "epoch": 0.668884293805628, |
| "grad_norm": 107.5625, |
| "learning_rate": 3.6792327161316104e-07, |
| "loss": 80.321, |
| "step": 5580 |
| }, |
| { |
| "epoch": 0.6694836524918337, |
| "grad_norm": 104.0, |
| "learning_rate": 3.672572265885174e-07, |
| "loss": 79.958, |
| "step": 5585 |
| }, |
| { |
| "epoch": 0.6700830111780395, |
| "grad_norm": 106.25, |
| "learning_rate": 3.665911815638737e-07, |
| "loss": 79.1322, |
| "step": 5590 |
| }, |
| { |
| "epoch": 0.6706823698642452, |
| "grad_norm": 106.125, |
| "learning_rate": 3.6592513653923e-07, |
| "loss": 79.3344, |
| "step": 5595 |
| }, |
| { |
| "epoch": 0.671281728550451, |
| "grad_norm": 108.0, |
| "learning_rate": 3.652590915145864e-07, |
| "loss": 79.35, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.6718810872366567, |
| "grad_norm": 101.375, |
| "learning_rate": 3.645930464899427e-07, |
| "loss": 78.8445, |
| "step": 5605 |
| }, |
| { |
| "epoch": 0.6724804459228625, |
| "grad_norm": 104.5625, |
| "learning_rate": 3.6392700146529905e-07, |
| "loss": 80.2318, |
| "step": 5610 |
| }, |
| { |
| "epoch": 0.6730798046090682, |
| "grad_norm": 105.8125, |
| "learning_rate": 3.6326095644065535e-07, |
| "loss": 79.9447, |
| "step": 5615 |
| }, |
| { |
| "epoch": 0.6736791632952741, |
| "grad_norm": 103.5625, |
| "learning_rate": 3.625949114160117e-07, |
| "loss": 79.6821, |
| "step": 5620 |
| }, |
| { |
| "epoch": 0.6742785219814799, |
| "grad_norm": 105.625, |
| "learning_rate": 3.6192886639136805e-07, |
| "loss": 79.4833, |
| "step": 5625 |
| }, |
| { |
| "epoch": 0.6748778806676856, |
| "grad_norm": 103.5625, |
| "learning_rate": 3.6126282136672435e-07, |
| "loss": 80.9992, |
| "step": 5630 |
| }, |
| { |
| "epoch": 0.6754772393538914, |
| "grad_norm": 106.6875, |
| "learning_rate": 3.605967763420807e-07, |
| "loss": 80.4254, |
| "step": 5635 |
| }, |
| { |
| "epoch": 0.6760765980400971, |
| "grad_norm": 105.625, |
| "learning_rate": 3.5993073131743706e-07, |
| "loss": 79.7526, |
| "step": 5640 |
| }, |
| { |
| "epoch": 0.6766759567263029, |
| "grad_norm": 107.75, |
| "learning_rate": 3.592646862927934e-07, |
| "loss": 78.7197, |
| "step": 5645 |
| }, |
| { |
| "epoch": 0.6772753154125086, |
| "grad_norm": 104.375, |
| "learning_rate": 3.585986412681497e-07, |
| "loss": 79.867, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.6778746740987144, |
| "grad_norm": 104.0, |
| "learning_rate": 3.57932596243506e-07, |
| "loss": 79.7559, |
| "step": 5655 |
| }, |
| { |
| "epoch": 0.6784740327849201, |
| "grad_norm": 104.875, |
| "learning_rate": 3.572665512188624e-07, |
| "loss": 79.7349, |
| "step": 5660 |
| }, |
| { |
| "epoch": 0.6790733914711259, |
| "grad_norm": 106.6875, |
| "learning_rate": 3.566005061942187e-07, |
| "loss": 80.9384, |
| "step": 5665 |
| }, |
| { |
| "epoch": 0.6796727501573316, |
| "grad_norm": 107.625, |
| "learning_rate": 3.5593446116957506e-07, |
| "loss": 79.7798, |
| "step": 5670 |
| }, |
| { |
| "epoch": 0.6802721088435374, |
| "grad_norm": 107.5625, |
| "learning_rate": 3.5526841614493136e-07, |
| "loss": 79.8944, |
| "step": 5675 |
| }, |
| { |
| "epoch": 0.6808714675297431, |
| "grad_norm": 104.3125, |
| "learning_rate": 3.546023711202877e-07, |
| "loss": 79.2222, |
| "step": 5680 |
| }, |
| { |
| "epoch": 0.6814708262159489, |
| "grad_norm": 105.0, |
| "learning_rate": 3.5393632609564407e-07, |
| "loss": 79.4649, |
| "step": 5685 |
| }, |
| { |
| "epoch": 0.6820701849021547, |
| "grad_norm": 106.1875, |
| "learning_rate": 3.5327028107100037e-07, |
| "loss": 79.6086, |
| "step": 5690 |
| }, |
| { |
| "epoch": 0.6826695435883604, |
| "grad_norm": 105.9375, |
| "learning_rate": 3.526042360463567e-07, |
| "loss": 81.1845, |
| "step": 5695 |
| }, |
| { |
| "epoch": 0.6832689022745662, |
| "grad_norm": 104.5625, |
| "learning_rate": 3.5193819102171307e-07, |
| "loss": 80.7964, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.683868260960772, |
| "grad_norm": 106.8125, |
| "learning_rate": 3.5127214599706937e-07, |
| "loss": 80.6059, |
| "step": 5705 |
| }, |
| { |
| "epoch": 0.6844676196469778, |
| "grad_norm": 107.875, |
| "learning_rate": 3.506061009724257e-07, |
| "loss": 79.1676, |
| "step": 5710 |
| }, |
| { |
| "epoch": 0.6850669783331835, |
| "grad_norm": 107.375, |
| "learning_rate": 3.49940055947782e-07, |
| "loss": 78.7386, |
| "step": 5715 |
| }, |
| { |
| "epoch": 0.6856663370193893, |
| "grad_norm": 107.4375, |
| "learning_rate": 3.492740109231384e-07, |
| "loss": 80.582, |
| "step": 5720 |
| }, |
| { |
| "epoch": 0.686265695705595, |
| "grad_norm": 106.0, |
| "learning_rate": 3.486079658984947e-07, |
| "loss": 78.4127, |
| "step": 5725 |
| }, |
| { |
| "epoch": 0.6868650543918008, |
| "grad_norm": 103.375, |
| "learning_rate": 3.47941920873851e-07, |
| "loss": 79.0533, |
| "step": 5730 |
| }, |
| { |
| "epoch": 0.6874644130780065, |
| "grad_norm": 105.125, |
| "learning_rate": 3.472758758492074e-07, |
| "loss": 80.4304, |
| "step": 5735 |
| }, |
| { |
| "epoch": 0.6880637717642123, |
| "grad_norm": 106.125, |
| "learning_rate": 3.4660983082456373e-07, |
| "loss": 80.7424, |
| "step": 5740 |
| }, |
| { |
| "epoch": 0.688663130450418, |
| "grad_norm": 107.0, |
| "learning_rate": 3.459437857999201e-07, |
| "loss": 80.3244, |
| "step": 5745 |
| }, |
| { |
| "epoch": 0.6892624891366238, |
| "grad_norm": 103.1875, |
| "learning_rate": 3.452777407752764e-07, |
| "loss": 79.961, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.6898618478228296, |
| "grad_norm": 104.9375, |
| "learning_rate": 3.446116957506327e-07, |
| "loss": 78.8729, |
| "step": 5755 |
| }, |
| { |
| "epoch": 0.6904612065090353, |
| "grad_norm": 109.0625, |
| "learning_rate": 3.439456507259891e-07, |
| "loss": 80.4733, |
| "step": 5760 |
| }, |
| { |
| "epoch": 0.6910605651952411, |
| "grad_norm": 106.4375, |
| "learning_rate": 3.432796057013454e-07, |
| "loss": 78.2223, |
| "step": 5765 |
| }, |
| { |
| "epoch": 0.6916599238814468, |
| "grad_norm": 106.3125, |
| "learning_rate": 3.4261356067670173e-07, |
| "loss": 80.0241, |
| "step": 5770 |
| }, |
| { |
| "epoch": 0.6922592825676526, |
| "grad_norm": 103.6875, |
| "learning_rate": 3.4194751565205803e-07, |
| "loss": 78.9867, |
| "step": 5775 |
| }, |
| { |
| "epoch": 0.6928586412538583, |
| "grad_norm": 105.75, |
| "learning_rate": 3.4128147062741444e-07, |
| "loss": 78.7172, |
| "step": 5780 |
| }, |
| { |
| "epoch": 0.6934579999400642, |
| "grad_norm": 104.25, |
| "learning_rate": 3.4061542560277074e-07, |
| "loss": 79.7752, |
| "step": 5785 |
| }, |
| { |
| "epoch": 0.6940573586262699, |
| "grad_norm": 106.5625, |
| "learning_rate": 3.3994938057812704e-07, |
| "loss": 79.1381, |
| "step": 5790 |
| }, |
| { |
| "epoch": 0.6946567173124757, |
| "grad_norm": 105.3125, |
| "learning_rate": 3.392833355534834e-07, |
| "loss": 78.426, |
| "step": 5795 |
| }, |
| { |
| "epoch": 0.6952560759986814, |
| "grad_norm": 106.8125, |
| "learning_rate": 3.3861729052883974e-07, |
| "loss": 79.6124, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.6958554346848872, |
| "grad_norm": 104.875, |
| "learning_rate": 3.379512455041961e-07, |
| "loss": 79.5376, |
| "step": 5805 |
| }, |
| { |
| "epoch": 0.696454793371093, |
| "grad_norm": 105.0625, |
| "learning_rate": 3.372852004795524e-07, |
| "loss": 77.9224, |
| "step": 5810 |
| }, |
| { |
| "epoch": 0.6970541520572987, |
| "grad_norm": 104.125, |
| "learning_rate": 3.366191554549087e-07, |
| "loss": 79.9884, |
| "step": 5815 |
| }, |
| { |
| "epoch": 0.6976535107435045, |
| "grad_norm": 105.5, |
| "learning_rate": 3.359531104302651e-07, |
| "loss": 80.5143, |
| "step": 5820 |
| }, |
| { |
| "epoch": 0.6982528694297102, |
| "grad_norm": 108.8125, |
| "learning_rate": 3.352870654056214e-07, |
| "loss": 79.1192, |
| "step": 5825 |
| }, |
| { |
| "epoch": 0.698852228115916, |
| "grad_norm": 106.125, |
| "learning_rate": 3.3462102038097775e-07, |
| "loss": 79.9099, |
| "step": 5830 |
| }, |
| { |
| "epoch": 0.6994515868021217, |
| "grad_norm": 106.3125, |
| "learning_rate": 3.3395497535633405e-07, |
| "loss": 79.8999, |
| "step": 5835 |
| }, |
| { |
| "epoch": 0.7000509454883275, |
| "grad_norm": 103.75, |
| "learning_rate": 3.332889303316904e-07, |
| "loss": 79.162, |
| "step": 5840 |
| }, |
| { |
| "epoch": 0.7006503041745332, |
| "grad_norm": 105.625, |
| "learning_rate": 3.3262288530704675e-07, |
| "loss": 78.1294, |
| "step": 5845 |
| }, |
| { |
| "epoch": 0.701249662860739, |
| "grad_norm": 105.125, |
| "learning_rate": 3.3195684028240305e-07, |
| "loss": 78.6101, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.7018490215469447, |
| "grad_norm": 105.375, |
| "learning_rate": 3.3129079525775945e-07, |
| "loss": 77.9655, |
| "step": 5855 |
| }, |
| { |
| "epoch": 0.7024483802331505, |
| "grad_norm": 107.9375, |
| "learning_rate": 3.3062475023311575e-07, |
| "loss": 77.3438, |
| "step": 5860 |
| }, |
| { |
| "epoch": 0.7030477389193562, |
| "grad_norm": 105.5, |
| "learning_rate": 3.2995870520847205e-07, |
| "loss": 78.5834, |
| "step": 5865 |
| }, |
| { |
| "epoch": 0.7036470976055621, |
| "grad_norm": 105.8125, |
| "learning_rate": 3.292926601838284e-07, |
| "loss": 77.6612, |
| "step": 5870 |
| }, |
| { |
| "epoch": 0.7042464562917679, |
| "grad_norm": 103.375, |
| "learning_rate": 3.2862661515918476e-07, |
| "loss": 77.863, |
| "step": 5875 |
| }, |
| { |
| "epoch": 0.7048458149779736, |
| "grad_norm": 103.625, |
| "learning_rate": 3.279605701345411e-07, |
| "loss": 80.0449, |
| "step": 5880 |
| }, |
| { |
| "epoch": 0.7054451736641794, |
| "grad_norm": 109.125, |
| "learning_rate": 3.272945251098974e-07, |
| "loss": 78.6147, |
| "step": 5885 |
| }, |
| { |
| "epoch": 0.7060445323503851, |
| "grad_norm": 104.75, |
| "learning_rate": 3.266284800852537e-07, |
| "loss": 79.8518, |
| "step": 5890 |
| }, |
| { |
| "epoch": 0.7066438910365909, |
| "grad_norm": 104.0, |
| "learning_rate": 3.259624350606101e-07, |
| "loss": 79.7573, |
| "step": 5895 |
| }, |
| { |
| "epoch": 0.7072432497227966, |
| "grad_norm": 108.0, |
| "learning_rate": 3.252963900359664e-07, |
| "loss": 79.0408, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.7078426084090024, |
| "grad_norm": 107.0625, |
| "learning_rate": 3.2463034501132276e-07, |
| "loss": 78.054, |
| "step": 5905 |
| }, |
| { |
| "epoch": 0.7084419670952081, |
| "grad_norm": 109.4375, |
| "learning_rate": 3.2396429998667906e-07, |
| "loss": 78.9491, |
| "step": 5910 |
| }, |
| { |
| "epoch": 0.7090413257814139, |
| "grad_norm": 105.6875, |
| "learning_rate": 3.2329825496203547e-07, |
| "loss": 78.7078, |
| "step": 5915 |
| }, |
| { |
| "epoch": 0.7096406844676196, |
| "grad_norm": 107.25, |
| "learning_rate": 3.2263220993739177e-07, |
| "loss": 79.0629, |
| "step": 5920 |
| }, |
| { |
| "epoch": 0.7102400431538254, |
| "grad_norm": 106.875, |
| "learning_rate": 3.2196616491274806e-07, |
| "loss": 77.7746, |
| "step": 5925 |
| }, |
| { |
| "epoch": 0.7108394018400311, |
| "grad_norm": 108.375, |
| "learning_rate": 3.213001198881044e-07, |
| "loss": 78.0724, |
| "step": 5930 |
| }, |
| { |
| "epoch": 0.7114387605262369, |
| "grad_norm": 106.125, |
| "learning_rate": 3.2063407486346077e-07, |
| "loss": 78.7447, |
| "step": 5935 |
| }, |
| { |
| "epoch": 0.7120381192124426, |
| "grad_norm": 106.0625, |
| "learning_rate": 3.199680298388171e-07, |
| "loss": 77.4366, |
| "step": 5940 |
| }, |
| { |
| "epoch": 0.7126374778986484, |
| "grad_norm": 105.125, |
| "learning_rate": 3.193019848141734e-07, |
| "loss": 79.7885, |
| "step": 5945 |
| }, |
| { |
| "epoch": 0.7132368365848543, |
| "grad_norm": 102.8125, |
| "learning_rate": 3.186359397895297e-07, |
| "loss": 78.1937, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.71383619527106, |
| "grad_norm": 105.875, |
| "learning_rate": 3.179698947648861e-07, |
| "loss": 80.1111, |
| "step": 5955 |
| }, |
| { |
| "epoch": 0.7144355539572658, |
| "grad_norm": 105.8125, |
| "learning_rate": 3.173038497402424e-07, |
| "loss": 78.5599, |
| "step": 5960 |
| }, |
| { |
| "epoch": 0.7150349126434715, |
| "grad_norm": 106.25, |
| "learning_rate": 3.166378047155988e-07, |
| "loss": 78.213, |
| "step": 5965 |
| }, |
| { |
| "epoch": 0.7156342713296773, |
| "grad_norm": 108.3125, |
| "learning_rate": 3.159717596909551e-07, |
| "loss": 79.0787, |
| "step": 5970 |
| }, |
| { |
| "epoch": 0.716233630015883, |
| "grad_norm": 105.0, |
| "learning_rate": 3.153057146663114e-07, |
| "loss": 79.603, |
| "step": 5975 |
| }, |
| { |
| "epoch": 0.7168329887020888, |
| "grad_norm": 106.0, |
| "learning_rate": 3.146396696416678e-07, |
| "loss": 78.3208, |
| "step": 5980 |
| }, |
| { |
| "epoch": 0.7174323473882945, |
| "grad_norm": 109.4375, |
| "learning_rate": 3.139736246170241e-07, |
| "loss": 79.0942, |
| "step": 5985 |
| }, |
| { |
| "epoch": 0.7180317060745003, |
| "grad_norm": 107.25, |
| "learning_rate": 3.1330757959238043e-07, |
| "loss": 78.9856, |
| "step": 5990 |
| }, |
| { |
| "epoch": 0.718631064760706, |
| "grad_norm": 104.4375, |
| "learning_rate": 3.126415345677368e-07, |
| "loss": 79.374, |
| "step": 5995 |
| }, |
| { |
| "epoch": 0.7192304234469118, |
| "grad_norm": 106.4375, |
| "learning_rate": 3.119754895430931e-07, |
| "loss": 78.3493, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.7192304234469118, |
| "eval_loss": 2.469325542449951, |
| "eval_runtime": 404.8602, |
| "eval_samples_per_second": 1110.514, |
| "eval_steps_per_second": 34.706, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.7198297821331175, |
| "grad_norm": 106.875, |
| "learning_rate": 3.1130944451844943e-07, |
| "loss": 77.937, |
| "step": 6005 |
| }, |
| { |
| "epoch": 0.7204291408193233, |
| "grad_norm": 107.0625, |
| "learning_rate": 3.1064339949380573e-07, |
| "loss": 79.7142, |
| "step": 6010 |
| }, |
| { |
| "epoch": 0.721028499505529, |
| "grad_norm": 108.875, |
| "learning_rate": 3.0997735446916214e-07, |
| "loss": 78.1682, |
| "step": 6015 |
| }, |
| { |
| "epoch": 0.7216278581917348, |
| "grad_norm": 106.75, |
| "learning_rate": 3.0931130944451844e-07, |
| "loss": 77.3161, |
| "step": 6020 |
| }, |
| { |
| "epoch": 0.7222272168779406, |
| "grad_norm": 105.5, |
| "learning_rate": 3.086452644198748e-07, |
| "loss": 79.6543, |
| "step": 6025 |
| }, |
| { |
| "epoch": 0.7228265755641463, |
| "grad_norm": 112.125, |
| "learning_rate": 3.079792193952311e-07, |
| "loss": 78.0319, |
| "step": 6030 |
| }, |
| { |
| "epoch": 0.7234259342503522, |
| "grad_norm": 105.875, |
| "learning_rate": 3.0731317437058744e-07, |
| "loss": 78.8495, |
| "step": 6035 |
| }, |
| { |
| "epoch": 0.7240252929365579, |
| "grad_norm": 104.3125, |
| "learning_rate": 3.066471293459438e-07, |
| "loss": 78.3328, |
| "step": 6040 |
| }, |
| { |
| "epoch": 0.7246246516227637, |
| "grad_norm": 104.375, |
| "learning_rate": 3.059810843213001e-07, |
| "loss": 78.7412, |
| "step": 6045 |
| }, |
| { |
| "epoch": 0.7252240103089694, |
| "grad_norm": 107.25, |
| "learning_rate": 3.0531503929665644e-07, |
| "loss": 79.4384, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.7258233689951752, |
| "grad_norm": 103.0, |
| "learning_rate": 3.046489942720128e-07, |
| "loss": 77.85, |
| "step": 6055 |
| }, |
| { |
| "epoch": 0.7264227276813809, |
| "grad_norm": 107.3125, |
| "learning_rate": 3.039829492473691e-07, |
| "loss": 78.9541, |
| "step": 6060 |
| }, |
| { |
| "epoch": 0.7270220863675867, |
| "grad_norm": 105.4375, |
| "learning_rate": 3.0331690422272545e-07, |
| "loss": 80.1152, |
| "step": 6065 |
| }, |
| { |
| "epoch": 0.7276214450537924, |
| "grad_norm": 106.8125, |
| "learning_rate": 3.0265085919808174e-07, |
| "loss": 79.0895, |
| "step": 6070 |
| }, |
| { |
| "epoch": 0.7282208037399982, |
| "grad_norm": 108.0625, |
| "learning_rate": 3.0198481417343815e-07, |
| "loss": 79.5219, |
| "step": 6075 |
| }, |
| { |
| "epoch": 0.728820162426204, |
| "grad_norm": 107.25, |
| "learning_rate": 3.0131876914879445e-07, |
| "loss": 78.7073, |
| "step": 6080 |
| }, |
| { |
| "epoch": 0.7294195211124097, |
| "grad_norm": 106.875, |
| "learning_rate": 3.0065272412415075e-07, |
| "loss": 78.4252, |
| "step": 6085 |
| }, |
| { |
| "epoch": 0.7300188797986155, |
| "grad_norm": 106.125, |
| "learning_rate": 2.999866790995071e-07, |
| "loss": 80.2118, |
| "step": 6090 |
| }, |
| { |
| "epoch": 0.7306182384848212, |
| "grad_norm": 109.125, |
| "learning_rate": 2.9932063407486345e-07, |
| "loss": 78.9803, |
| "step": 6095 |
| }, |
| { |
| "epoch": 0.731217597171027, |
| "grad_norm": 106.0625, |
| "learning_rate": 2.986545890502198e-07, |
| "loss": 79.6833, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.7318169558572327, |
| "grad_norm": 107.375, |
| "learning_rate": 2.979885440255761e-07, |
| "loss": 78.3041, |
| "step": 6105 |
| }, |
| { |
| "epoch": 0.7324163145434385, |
| "grad_norm": 108.3125, |
| "learning_rate": 2.973224990009324e-07, |
| "loss": 77.6695, |
| "step": 6110 |
| }, |
| { |
| "epoch": 0.7330156732296442, |
| "grad_norm": 106.3125, |
| "learning_rate": 2.966564539762888e-07, |
| "loss": 78.813, |
| "step": 6115 |
| }, |
| { |
| "epoch": 0.7336150319158501, |
| "grad_norm": 104.625, |
| "learning_rate": 2.959904089516451e-07, |
| "loss": 78.1065, |
| "step": 6120 |
| }, |
| { |
| "epoch": 0.7342143906020558, |
| "grad_norm": 102.25, |
| "learning_rate": 2.9532436392700146e-07, |
| "loss": 78.4899, |
| "step": 6125 |
| }, |
| { |
| "epoch": 0.7348137492882616, |
| "grad_norm": 110.1875, |
| "learning_rate": 2.9465831890235776e-07, |
| "loss": 79.3901, |
| "step": 6130 |
| }, |
| { |
| "epoch": 0.7354131079744674, |
| "grad_norm": 105.875, |
| "learning_rate": 2.9399227387771416e-07, |
| "loss": 79.4775, |
| "step": 6135 |
| }, |
| { |
| "epoch": 0.7360124666606731, |
| "grad_norm": 108.625, |
| "learning_rate": 2.9332622885307046e-07, |
| "loss": 78.8485, |
| "step": 6140 |
| }, |
| { |
| "epoch": 0.7366118253468789, |
| "grad_norm": 106.125, |
| "learning_rate": 2.9266018382842676e-07, |
| "loss": 78.7774, |
| "step": 6145 |
| }, |
| { |
| "epoch": 0.7372111840330846, |
| "grad_norm": 101.625, |
| "learning_rate": 2.919941388037831e-07, |
| "loss": 77.9222, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.7378105427192904, |
| "grad_norm": 106.0625, |
| "learning_rate": 2.9132809377913946e-07, |
| "loss": 78.3709, |
| "step": 6155 |
| }, |
| { |
| "epoch": 0.7384099014054961, |
| "grad_norm": 105.125, |
| "learning_rate": 2.906620487544958e-07, |
| "loss": 77.9696, |
| "step": 6160 |
| }, |
| { |
| "epoch": 0.7390092600917019, |
| "grad_norm": 106.5, |
| "learning_rate": 2.899960037298521e-07, |
| "loss": 78.3595, |
| "step": 6165 |
| }, |
| { |
| "epoch": 0.7396086187779076, |
| "grad_norm": 104.6875, |
| "learning_rate": 2.893299587052084e-07, |
| "loss": 76.3922, |
| "step": 6170 |
| }, |
| { |
| "epoch": 0.7402079774641134, |
| "grad_norm": 105.25, |
| "learning_rate": 2.886639136805648e-07, |
| "loss": 78.9993, |
| "step": 6175 |
| }, |
| { |
| "epoch": 0.7408073361503191, |
| "grad_norm": 103.6875, |
| "learning_rate": 2.879978686559211e-07, |
| "loss": 78.1283, |
| "step": 6180 |
| }, |
| { |
| "epoch": 0.7414066948365249, |
| "grad_norm": 105.5, |
| "learning_rate": 2.8733182363127747e-07, |
| "loss": 78.5354, |
| "step": 6185 |
| }, |
| { |
| "epoch": 0.7420060535227306, |
| "grad_norm": 111.375, |
| "learning_rate": 2.8666577860663377e-07, |
| "loss": 78.5119, |
| "step": 6190 |
| }, |
| { |
| "epoch": 0.7426054122089364, |
| "grad_norm": 104.75, |
| "learning_rate": 2.859997335819901e-07, |
| "loss": 79.4887, |
| "step": 6195 |
| }, |
| { |
| "epoch": 0.7432047708951423, |
| "grad_norm": 105.1875, |
| "learning_rate": 2.853336885573465e-07, |
| "loss": 77.8327, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.743804129581348, |
| "grad_norm": 108.75, |
| "learning_rate": 2.8466764353270277e-07, |
| "loss": 78.3946, |
| "step": 6205 |
| }, |
| { |
| "epoch": 0.7444034882675538, |
| "grad_norm": 105.875, |
| "learning_rate": 2.840015985080592e-07, |
| "loss": 79.9381, |
| "step": 6210 |
| }, |
| { |
| "epoch": 0.7450028469537595, |
| "grad_norm": 105.6875, |
| "learning_rate": 2.833355534834155e-07, |
| "loss": 79.5875, |
| "step": 6215 |
| }, |
| { |
| "epoch": 0.7456022056399653, |
| "grad_norm": 102.9375, |
| "learning_rate": 2.826695084587718e-07, |
| "loss": 77.8823, |
| "step": 6220 |
| }, |
| { |
| "epoch": 0.746201564326171, |
| "grad_norm": 106.875, |
| "learning_rate": 2.8200346343412813e-07, |
| "loss": 78.1676, |
| "step": 6225 |
| }, |
| { |
| "epoch": 0.7468009230123768, |
| "grad_norm": 106.0, |
| "learning_rate": 2.813374184094845e-07, |
| "loss": 77.9816, |
| "step": 6230 |
| }, |
| { |
| "epoch": 0.7474002816985825, |
| "grad_norm": 106.3125, |
| "learning_rate": 2.8067137338484083e-07, |
| "loss": 78.3035, |
| "step": 6235 |
| }, |
| { |
| "epoch": 0.7479996403847883, |
| "grad_norm": 106.5, |
| "learning_rate": 2.8000532836019713e-07, |
| "loss": 80.3571, |
| "step": 6240 |
| }, |
| { |
| "epoch": 0.748598999070994, |
| "grad_norm": 106.5625, |
| "learning_rate": 2.7933928333555343e-07, |
| "loss": 79.3642, |
| "step": 6245 |
| }, |
| { |
| "epoch": 0.7491983577571998, |
| "grad_norm": 105.25, |
| "learning_rate": 2.7867323831090984e-07, |
| "loss": 77.0638, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.7497977164434055, |
| "grad_norm": 109.125, |
| "learning_rate": 2.7800719328626613e-07, |
| "loss": 78.2273, |
| "step": 6255 |
| }, |
| { |
| "epoch": 0.7503970751296113, |
| "grad_norm": 103.125, |
| "learning_rate": 2.773411482616225e-07, |
| "loss": 79.4584, |
| "step": 6260 |
| }, |
| { |
| "epoch": 0.750996433815817, |
| "grad_norm": 105.6875, |
| "learning_rate": 2.766751032369788e-07, |
| "loss": 77.9623, |
| "step": 6265 |
| }, |
| { |
| "epoch": 0.7515957925020228, |
| "grad_norm": 108.8125, |
| "learning_rate": 2.760090582123352e-07, |
| "loss": 77.3997, |
| "step": 6270 |
| }, |
| { |
| "epoch": 0.7521951511882286, |
| "grad_norm": 108.25, |
| "learning_rate": 2.753430131876915e-07, |
| "loss": 78.4736, |
| "step": 6275 |
| }, |
| { |
| "epoch": 0.7527945098744343, |
| "grad_norm": 108.6875, |
| "learning_rate": 2.746769681630478e-07, |
| "loss": 77.6046, |
| "step": 6280 |
| }, |
| { |
| "epoch": 0.7533938685606402, |
| "grad_norm": 107.25, |
| "learning_rate": 2.7401092313840414e-07, |
| "loss": 79.3138, |
| "step": 6285 |
| }, |
| { |
| "epoch": 0.7539932272468459, |
| "grad_norm": 106.4375, |
| "learning_rate": 2.733448781137605e-07, |
| "loss": 78.1768, |
| "step": 6290 |
| }, |
| { |
| "epoch": 0.7545925859330517, |
| "grad_norm": 108.25, |
| "learning_rate": 2.7267883308911685e-07, |
| "loss": 79.0016, |
| "step": 6295 |
| }, |
| { |
| "epoch": 0.7551919446192574, |
| "grad_norm": 103.9375, |
| "learning_rate": 2.7201278806447314e-07, |
| "loss": 78.4366, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.7557913033054632, |
| "grad_norm": 109.75, |
| "learning_rate": 2.7134674303982944e-07, |
| "loss": 78.4717, |
| "step": 6305 |
| }, |
| { |
| "epoch": 0.7563906619916689, |
| "grad_norm": 105.25, |
| "learning_rate": 2.7068069801518585e-07, |
| "loss": 77.4006, |
| "step": 6310 |
| }, |
| { |
| "epoch": 0.7569900206778747, |
| "grad_norm": 108.1875, |
| "learning_rate": 2.7001465299054215e-07, |
| "loss": 79.0828, |
| "step": 6315 |
| }, |
| { |
| "epoch": 0.7575893793640804, |
| "grad_norm": 106.4375, |
| "learning_rate": 2.693486079658985e-07, |
| "loss": 78.3332, |
| "step": 6320 |
| }, |
| { |
| "epoch": 0.7581887380502862, |
| "grad_norm": 106.25, |
| "learning_rate": 2.686825629412548e-07, |
| "loss": 79.0802, |
| "step": 6325 |
| }, |
| { |
| "epoch": 0.758788096736492, |
| "grad_norm": 106.1875, |
| "learning_rate": 2.6801651791661115e-07, |
| "loss": 77.4329, |
| "step": 6330 |
| }, |
| { |
| "epoch": 0.7593874554226977, |
| "grad_norm": 103.4375, |
| "learning_rate": 2.673504728919675e-07, |
| "loss": 77.6086, |
| "step": 6335 |
| }, |
| { |
| "epoch": 0.7599868141089035, |
| "grad_norm": 107.75, |
| "learning_rate": 2.666844278673238e-07, |
| "loss": 77.9435, |
| "step": 6340 |
| }, |
| { |
| "epoch": 0.7605861727951092, |
| "grad_norm": 108.5, |
| "learning_rate": 2.6601838284268015e-07, |
| "loss": 78.1191, |
| "step": 6345 |
| }, |
| { |
| "epoch": 0.761185531481315, |
| "grad_norm": 107.0, |
| "learning_rate": 2.653523378180365e-07, |
| "loss": 77.3275, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.7617848901675207, |
| "grad_norm": 105.0625, |
| "learning_rate": 2.646862927933928e-07, |
| "loss": 78.2275, |
| "step": 6355 |
| }, |
| { |
| "epoch": 0.7623842488537265, |
| "grad_norm": 107.3125, |
| "learning_rate": 2.6402024776874916e-07, |
| "loss": 77.8997, |
| "step": 6360 |
| }, |
| { |
| "epoch": 0.7629836075399322, |
| "grad_norm": 111.0, |
| "learning_rate": 2.6335420274410546e-07, |
| "loss": 78.2208, |
| "step": 6365 |
| }, |
| { |
| "epoch": 0.7635829662261381, |
| "grad_norm": 109.375, |
| "learning_rate": 2.6268815771946186e-07, |
| "loss": 80.8852, |
| "step": 6370 |
| }, |
| { |
| "epoch": 0.7641823249123438, |
| "grad_norm": 104.0625, |
| "learning_rate": 2.6202211269481816e-07, |
| "loss": 77.9402, |
| "step": 6375 |
| }, |
| { |
| "epoch": 0.7647816835985496, |
| "grad_norm": 106.3125, |
| "learning_rate": 2.6135606767017446e-07, |
| "loss": 79.3084, |
| "step": 6380 |
| }, |
| { |
| "epoch": 0.7653810422847553, |
| "grad_norm": 108.6875, |
| "learning_rate": 2.606900226455308e-07, |
| "loss": 78.8786, |
| "step": 6385 |
| }, |
| { |
| "epoch": 0.7659804009709611, |
| "grad_norm": 109.125, |
| "learning_rate": 2.6002397762088716e-07, |
| "loss": 78.2904, |
| "step": 6390 |
| }, |
| { |
| "epoch": 0.7665797596571668, |
| "grad_norm": 104.5, |
| "learning_rate": 2.593579325962435e-07, |
| "loss": 77.6137, |
| "step": 6395 |
| }, |
| { |
| "epoch": 0.7671791183433726, |
| "grad_norm": 106.25, |
| "learning_rate": 2.586918875715998e-07, |
| "loss": 78.5451, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.7677784770295784, |
| "grad_norm": 106.375, |
| "learning_rate": 2.5802584254695617e-07, |
| "loss": 79.5644, |
| "step": 6405 |
| }, |
| { |
| "epoch": 0.7683778357157841, |
| "grad_norm": 103.8125, |
| "learning_rate": 2.573597975223125e-07, |
| "loss": 77.4564, |
| "step": 6410 |
| }, |
| { |
| "epoch": 0.7689771944019899, |
| "grad_norm": 105.3125, |
| "learning_rate": 2.566937524976688e-07, |
| "loss": 77.9053, |
| "step": 6415 |
| }, |
| { |
| "epoch": 0.7695765530881956, |
| "grad_norm": 106.5, |
| "learning_rate": 2.5602770747302517e-07, |
| "loss": 79.797, |
| "step": 6420 |
| }, |
| { |
| "epoch": 0.7701759117744014, |
| "grad_norm": 107.125, |
| "learning_rate": 2.5536166244838147e-07, |
| "loss": 77.8786, |
| "step": 6425 |
| }, |
| { |
| "epoch": 0.7707752704606071, |
| "grad_norm": 106.25, |
| "learning_rate": 2.546956174237379e-07, |
| "loss": 79.659, |
| "step": 6430 |
| }, |
| { |
| "epoch": 0.7713746291468129, |
| "grad_norm": 103.875, |
| "learning_rate": 2.5402957239909417e-07, |
| "loss": 77.1865, |
| "step": 6435 |
| }, |
| { |
| "epoch": 0.7719739878330186, |
| "grad_norm": 108.8125, |
| "learning_rate": 2.5336352737445047e-07, |
| "loss": 77.6755, |
| "step": 6440 |
| }, |
| { |
| "epoch": 0.7725733465192244, |
| "grad_norm": 106.625, |
| "learning_rate": 2.526974823498068e-07, |
| "loss": 79.6433, |
| "step": 6445 |
| }, |
| { |
| "epoch": 0.7731727052054302, |
| "grad_norm": 106.8125, |
| "learning_rate": 2.520314373251632e-07, |
| "loss": 78.4122, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.773772063891636, |
| "grad_norm": 107.4375, |
| "learning_rate": 2.5136539230051953e-07, |
| "loss": 77.1019, |
| "step": 6455 |
| }, |
| { |
| "epoch": 0.7743714225778418, |
| "grad_norm": 105.1875, |
| "learning_rate": 2.5069934727587583e-07, |
| "loss": 79.4794, |
| "step": 6460 |
| }, |
| { |
| "epoch": 0.7749707812640475, |
| "grad_norm": 106.5, |
| "learning_rate": 2.500333022512321e-07, |
| "loss": 77.2468, |
| "step": 6465 |
| }, |
| { |
| "epoch": 0.7755701399502533, |
| "grad_norm": 103.5625, |
| "learning_rate": 2.493672572265885e-07, |
| "loss": 78.32, |
| "step": 6470 |
| }, |
| { |
| "epoch": 0.776169498636459, |
| "grad_norm": 107.0625, |
| "learning_rate": 2.4870121220194483e-07, |
| "loss": 78.8655, |
| "step": 6475 |
| }, |
| { |
| "epoch": 0.7767688573226648, |
| "grad_norm": 106.75, |
| "learning_rate": 2.480351671773012e-07, |
| "loss": 77.82, |
| "step": 6480 |
| }, |
| { |
| "epoch": 0.7773682160088705, |
| "grad_norm": 107.125, |
| "learning_rate": 2.4736912215265753e-07, |
| "loss": 77.6949, |
| "step": 6485 |
| }, |
| { |
| "epoch": 0.7779675746950763, |
| "grad_norm": 104.75, |
| "learning_rate": 2.4670307712801383e-07, |
| "loss": 77.4096, |
| "step": 6490 |
| }, |
| { |
| "epoch": 0.778566933381282, |
| "grad_norm": 108.875, |
| "learning_rate": 2.460370321033702e-07, |
| "loss": 78.1701, |
| "step": 6495 |
| }, |
| { |
| "epoch": 0.7791662920674878, |
| "grad_norm": 106.8125, |
| "learning_rate": 2.453709870787265e-07, |
| "loss": 77.7304, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.7791662920674878, |
| "eval_loss": 2.4339072704315186, |
| "eval_runtime": 403.9267, |
| "eval_samples_per_second": 1113.081, |
| "eval_steps_per_second": 34.786, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.7797656507536935, |
| "grad_norm": 107.1875, |
| "learning_rate": 2.4470494205408284e-07, |
| "loss": 78.4344, |
| "step": 6505 |
| }, |
| { |
| "epoch": 0.7803650094398993, |
| "grad_norm": 106.6875, |
| "learning_rate": 2.440388970294392e-07, |
| "loss": 78.3033, |
| "step": 6510 |
| }, |
| { |
| "epoch": 0.780964368126105, |
| "grad_norm": 104.375, |
| "learning_rate": 2.4337285200479554e-07, |
| "loss": 77.7646, |
| "step": 6515 |
| }, |
| { |
| "epoch": 0.7815637268123108, |
| "grad_norm": 107.875, |
| "learning_rate": 2.4270680698015184e-07, |
| "loss": 77.9291, |
| "step": 6520 |
| }, |
| { |
| "epoch": 0.7821630854985165, |
| "grad_norm": 109.0, |
| "learning_rate": 2.420407619555082e-07, |
| "loss": 78.5846, |
| "step": 6525 |
| }, |
| { |
| "epoch": 0.7827624441847223, |
| "grad_norm": 108.6875, |
| "learning_rate": 2.413747169308645e-07, |
| "loss": 77.4087, |
| "step": 6530 |
| }, |
| { |
| "epoch": 0.7833618028709282, |
| "grad_norm": 107.625, |
| "learning_rate": 2.4070867190622084e-07, |
| "loss": 78.0319, |
| "step": 6535 |
| }, |
| { |
| "epoch": 0.7839611615571339, |
| "grad_norm": 109.625, |
| "learning_rate": 2.400426268815772e-07, |
| "loss": 77.3543, |
| "step": 6540 |
| }, |
| { |
| "epoch": 0.7845605202433397, |
| "grad_norm": 102.375, |
| "learning_rate": 2.393765818569335e-07, |
| "loss": 77.9248, |
| "step": 6545 |
| }, |
| { |
| "epoch": 0.7851598789295454, |
| "grad_norm": 106.6875, |
| "learning_rate": 2.3871053683228985e-07, |
| "loss": 77.9559, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.7857592376157512, |
| "grad_norm": 104.5625, |
| "learning_rate": 2.380444918076462e-07, |
| "loss": 78.4013, |
| "step": 6555 |
| }, |
| { |
| "epoch": 0.7863585963019569, |
| "grad_norm": 106.6875, |
| "learning_rate": 2.3737844678300252e-07, |
| "loss": 77.4872, |
| "step": 6560 |
| }, |
| { |
| "epoch": 0.7869579549881627, |
| "grad_norm": 105.0, |
| "learning_rate": 2.3671240175835885e-07, |
| "loss": 77.9336, |
| "step": 6565 |
| }, |
| { |
| "epoch": 0.7875573136743684, |
| "grad_norm": 108.75, |
| "learning_rate": 2.360463567337152e-07, |
| "loss": 77.6178, |
| "step": 6570 |
| }, |
| { |
| "epoch": 0.7881566723605742, |
| "grad_norm": 106.9375, |
| "learning_rate": 2.3538031170907153e-07, |
| "loss": 77.6397, |
| "step": 6575 |
| }, |
| { |
| "epoch": 0.7887560310467799, |
| "grad_norm": 102.875, |
| "learning_rate": 2.3471426668442788e-07, |
| "loss": 77.6935, |
| "step": 6580 |
| }, |
| { |
| "epoch": 0.7893553897329857, |
| "grad_norm": 102.5625, |
| "learning_rate": 2.3404822165978418e-07, |
| "loss": 78.3942, |
| "step": 6585 |
| }, |
| { |
| "epoch": 0.7899547484191914, |
| "grad_norm": 108.4375, |
| "learning_rate": 2.3338217663514053e-07, |
| "loss": 77.9903, |
| "step": 6590 |
| }, |
| { |
| "epoch": 0.7905541071053972, |
| "grad_norm": 105.0625, |
| "learning_rate": 2.3271613161049686e-07, |
| "loss": 76.4352, |
| "step": 6595 |
| }, |
| { |
| "epoch": 0.791153465791603, |
| "grad_norm": 104.5625, |
| "learning_rate": 2.320500865858532e-07, |
| "loss": 77.667, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.7917528244778087, |
| "grad_norm": 107.4375, |
| "learning_rate": 2.3138404156120953e-07, |
| "loss": 77.5867, |
| "step": 6605 |
| }, |
| { |
| "epoch": 0.7923521831640145, |
| "grad_norm": 105.0, |
| "learning_rate": 2.3071799653656589e-07, |
| "loss": 77.8278, |
| "step": 6610 |
| }, |
| { |
| "epoch": 0.7929515418502202, |
| "grad_norm": 104.625, |
| "learning_rate": 2.3005195151192218e-07, |
| "loss": 77.053, |
| "step": 6615 |
| }, |
| { |
| "epoch": 0.7935509005364261, |
| "grad_norm": 105.9375, |
| "learning_rate": 2.2938590648727854e-07, |
| "loss": 78.3603, |
| "step": 6620 |
| }, |
| { |
| "epoch": 0.7941502592226318, |
| "grad_norm": 105.6875, |
| "learning_rate": 2.2871986146263486e-07, |
| "loss": 76.1169, |
| "step": 6625 |
| }, |
| { |
| "epoch": 0.7947496179088376, |
| "grad_norm": 107.125, |
| "learning_rate": 2.2805381643799121e-07, |
| "loss": 77.0539, |
| "step": 6630 |
| }, |
| { |
| "epoch": 0.7953489765950433, |
| "grad_norm": 105.6875, |
| "learning_rate": 2.2738777141334754e-07, |
| "loss": 77.8971, |
| "step": 6635 |
| }, |
| { |
| "epoch": 0.7959483352812491, |
| "grad_norm": 102.8125, |
| "learning_rate": 2.2672172638870387e-07, |
| "loss": 76.208, |
| "step": 6640 |
| }, |
| { |
| "epoch": 0.7965476939674548, |
| "grad_norm": 107.1875, |
| "learning_rate": 2.260556813640602e-07, |
| "loss": 77.1146, |
| "step": 6645 |
| }, |
| { |
| "epoch": 0.7971470526536606, |
| "grad_norm": 106.125, |
| "learning_rate": 2.2538963633941654e-07, |
| "loss": 77.5436, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.7977464113398663, |
| "grad_norm": 104.75, |
| "learning_rate": 2.2472359131477287e-07, |
| "loss": 77.1789, |
| "step": 6655 |
| }, |
| { |
| "epoch": 0.7983457700260721, |
| "grad_norm": 107.25, |
| "learning_rate": 2.2405754629012922e-07, |
| "loss": 76.5571, |
| "step": 6660 |
| }, |
| { |
| "epoch": 0.7989451287122779, |
| "grad_norm": 107.75, |
| "learning_rate": 2.2339150126548552e-07, |
| "loss": 77.4697, |
| "step": 6665 |
| }, |
| { |
| "epoch": 0.7995444873984836, |
| "grad_norm": 103.9375, |
| "learning_rate": 2.2272545624084187e-07, |
| "loss": 78.0711, |
| "step": 6670 |
| }, |
| { |
| "epoch": 0.8001438460846894, |
| "grad_norm": 107.25, |
| "learning_rate": 2.220594112161982e-07, |
| "loss": 77.3697, |
| "step": 6675 |
| }, |
| { |
| "epoch": 0.8007432047708951, |
| "grad_norm": 104.1875, |
| "learning_rate": 2.2139336619155455e-07, |
| "loss": 77.4097, |
| "step": 6680 |
| }, |
| { |
| "epoch": 0.8013425634571009, |
| "grad_norm": 106.0, |
| "learning_rate": 2.2072732116691087e-07, |
| "loss": 77.1239, |
| "step": 6685 |
| }, |
| { |
| "epoch": 0.8019419221433066, |
| "grad_norm": 107.0625, |
| "learning_rate": 2.2006127614226723e-07, |
| "loss": 77.2553, |
| "step": 6690 |
| }, |
| { |
| "epoch": 0.8025412808295124, |
| "grad_norm": 107.4375, |
| "learning_rate": 2.1939523111762353e-07, |
| "loss": 78.1408, |
| "step": 6695 |
| }, |
| { |
| "epoch": 0.8031406395157182, |
| "grad_norm": 109.5, |
| "learning_rate": 2.1872918609297988e-07, |
| "loss": 78.2807, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.803739998201924, |
| "grad_norm": 106.75, |
| "learning_rate": 2.180631410683362e-07, |
| "loss": 76.696, |
| "step": 6705 |
| }, |
| { |
| "epoch": 0.8043393568881297, |
| "grad_norm": 108.875, |
| "learning_rate": 2.1739709604369256e-07, |
| "loss": 77.1577, |
| "step": 6710 |
| }, |
| { |
| "epoch": 0.8049387155743355, |
| "grad_norm": 107.0, |
| "learning_rate": 2.1673105101904888e-07, |
| "loss": 77.7407, |
| "step": 6715 |
| }, |
| { |
| "epoch": 0.8055380742605412, |
| "grad_norm": 108.0, |
| "learning_rate": 2.160650059944052e-07, |
| "loss": 78.0718, |
| "step": 6720 |
| }, |
| { |
| "epoch": 0.806137432946747, |
| "grad_norm": 106.6875, |
| "learning_rate": 2.1539896096976153e-07, |
| "loss": 77.663, |
| "step": 6725 |
| }, |
| { |
| "epoch": 0.8067367916329528, |
| "grad_norm": 108.375, |
| "learning_rate": 2.1473291594511788e-07, |
| "loss": 78.4291, |
| "step": 6730 |
| }, |
| { |
| "epoch": 0.8073361503191585, |
| "grad_norm": 109.375, |
| "learning_rate": 2.140668709204742e-07, |
| "loss": 77.1584, |
| "step": 6735 |
| }, |
| { |
| "epoch": 0.8079355090053643, |
| "grad_norm": 105.3125, |
| "learning_rate": 2.1340082589583056e-07, |
| "loss": 76.6091, |
| "step": 6740 |
| }, |
| { |
| "epoch": 0.80853486769157, |
| "grad_norm": 106.625, |
| "learning_rate": 2.127347808711869e-07, |
| "loss": 76.3898, |
| "step": 6745 |
| }, |
| { |
| "epoch": 0.8091342263777758, |
| "grad_norm": 107.3125, |
| "learning_rate": 2.120687358465432e-07, |
| "loss": 76.942, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.8097335850639815, |
| "grad_norm": 106.5625, |
| "learning_rate": 2.1140269082189954e-07, |
| "loss": 76.9349, |
| "step": 6755 |
| }, |
| { |
| "epoch": 0.8103329437501873, |
| "grad_norm": 105.375, |
| "learning_rate": 2.107366457972559e-07, |
| "loss": 77.6708, |
| "step": 6760 |
| }, |
| { |
| "epoch": 0.810932302436393, |
| "grad_norm": 105.8125, |
| "learning_rate": 2.1007060077261222e-07, |
| "loss": 78.7127, |
| "step": 6765 |
| }, |
| { |
| "epoch": 0.8115316611225988, |
| "grad_norm": 105.0, |
| "learning_rate": 2.0940455574796857e-07, |
| "loss": 76.8139, |
| "step": 6770 |
| }, |
| { |
| "epoch": 0.8121310198088045, |
| "grad_norm": 106.9375, |
| "learning_rate": 2.0873851072332487e-07, |
| "loss": 77.4944, |
| "step": 6775 |
| }, |
| { |
| "epoch": 0.8127303784950103, |
| "grad_norm": 105.9375, |
| "learning_rate": 2.0807246569868122e-07, |
| "loss": 77.3446, |
| "step": 6780 |
| }, |
| { |
| "epoch": 0.8133297371812162, |
| "grad_norm": 106.375, |
| "learning_rate": 2.0740642067403754e-07, |
| "loss": 76.7164, |
| "step": 6785 |
| }, |
| { |
| "epoch": 0.8139290958674219, |
| "grad_norm": 105.0, |
| "learning_rate": 2.067403756493939e-07, |
| "loss": 76.5707, |
| "step": 6790 |
| }, |
| { |
| "epoch": 0.8145284545536277, |
| "grad_norm": 109.625, |
| "learning_rate": 2.0607433062475022e-07, |
| "loss": 78.0642, |
| "step": 6795 |
| }, |
| { |
| "epoch": 0.8151278132398334, |
| "grad_norm": 105.5625, |
| "learning_rate": 2.0540828560010657e-07, |
| "loss": 76.432, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.8157271719260392, |
| "grad_norm": 106.4375, |
| "learning_rate": 2.0474224057546287e-07, |
| "loss": 77.7618, |
| "step": 6805 |
| }, |
| { |
| "epoch": 0.8163265306122449, |
| "grad_norm": 108.625, |
| "learning_rate": 2.0407619555081923e-07, |
| "loss": 77.1692, |
| "step": 6810 |
| }, |
| { |
| "epoch": 0.8169258892984507, |
| "grad_norm": 108.6875, |
| "learning_rate": 2.0341015052617555e-07, |
| "loss": 77.461, |
| "step": 6815 |
| }, |
| { |
| "epoch": 0.8175252479846564, |
| "grad_norm": 106.3125, |
| "learning_rate": 2.027441055015319e-07, |
| "loss": 77.479, |
| "step": 6820 |
| }, |
| { |
| "epoch": 0.8181246066708622, |
| "grad_norm": 108.25, |
| "learning_rate": 2.0207806047688823e-07, |
| "loss": 77.7447, |
| "step": 6825 |
| }, |
| { |
| "epoch": 0.8187239653570679, |
| "grad_norm": 107.25, |
| "learning_rate": 2.0141201545224455e-07, |
| "loss": 78.9139, |
| "step": 6830 |
| }, |
| { |
| "epoch": 0.8193233240432737, |
| "grad_norm": 105.25, |
| "learning_rate": 2.0074597042760088e-07, |
| "loss": 76.8411, |
| "step": 6835 |
| }, |
| { |
| "epoch": 0.8199226827294794, |
| "grad_norm": 109.1875, |
| "learning_rate": 2.0007992540295723e-07, |
| "loss": 77.406, |
| "step": 6840 |
| }, |
| { |
| "epoch": 0.8205220414156852, |
| "grad_norm": 110.4375, |
| "learning_rate": 1.9941388037831356e-07, |
| "loss": 77.1655, |
| "step": 6845 |
| }, |
| { |
| "epoch": 0.8211214001018909, |
| "grad_norm": 107.1875, |
| "learning_rate": 1.987478353536699e-07, |
| "loss": 76.9701, |
| "step": 6850 |
| }, |
| { |
| "epoch": 0.8217207587880967, |
| "grad_norm": 104.0625, |
| "learning_rate": 1.980817903290262e-07, |
| "loss": 76.9663, |
| "step": 6855 |
| }, |
| { |
| "epoch": 0.8223201174743024, |
| "grad_norm": 106.125, |
| "learning_rate": 1.9741574530438256e-07, |
| "loss": 77.8956, |
| "step": 6860 |
| }, |
| { |
| "epoch": 0.8229194761605082, |
| "grad_norm": 103.25, |
| "learning_rate": 1.9674970027973889e-07, |
| "loss": 77.8959, |
| "step": 6865 |
| }, |
| { |
| "epoch": 0.8235188348467141, |
| "grad_norm": 106.125, |
| "learning_rate": 1.9608365525509524e-07, |
| "loss": 76.7267, |
| "step": 6870 |
| }, |
| { |
| "epoch": 0.8241181935329198, |
| "grad_norm": 107.0625, |
| "learning_rate": 1.9541761023045156e-07, |
| "loss": 76.7561, |
| "step": 6875 |
| }, |
| { |
| "epoch": 0.8247175522191256, |
| "grad_norm": 107.0625, |
| "learning_rate": 1.9475156520580792e-07, |
| "loss": 77.7667, |
| "step": 6880 |
| }, |
| { |
| "epoch": 0.8253169109053313, |
| "grad_norm": 106.3125, |
| "learning_rate": 1.9408552018116421e-07, |
| "loss": 76.7068, |
| "step": 6885 |
| }, |
| { |
| "epoch": 0.8259162695915371, |
| "grad_norm": 105.6875, |
| "learning_rate": 1.9341947515652057e-07, |
| "loss": 77.1725, |
| "step": 6890 |
| }, |
| { |
| "epoch": 0.8265156282777428, |
| "grad_norm": 107.125, |
| "learning_rate": 1.927534301318769e-07, |
| "loss": 76.7159, |
| "step": 6895 |
| }, |
| { |
| "epoch": 0.8271149869639486, |
| "grad_norm": 111.75, |
| "learning_rate": 1.9208738510723324e-07, |
| "loss": 76.9128, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.8277143456501543, |
| "grad_norm": 109.1875, |
| "learning_rate": 1.9142134008258957e-07, |
| "loss": 77.1265, |
| "step": 6905 |
| }, |
| { |
| "epoch": 0.8283137043363601, |
| "grad_norm": 105.4375, |
| "learning_rate": 1.907552950579459e-07, |
| "loss": 76.9765, |
| "step": 6910 |
| }, |
| { |
| "epoch": 0.8289130630225658, |
| "grad_norm": 106.5625, |
| "learning_rate": 1.9008925003330225e-07, |
| "loss": 76.0758, |
| "step": 6915 |
| }, |
| { |
| "epoch": 0.8295124217087716, |
| "grad_norm": 108.625, |
| "learning_rate": 1.8942320500865857e-07, |
| "loss": 77.2638, |
| "step": 6920 |
| }, |
| { |
| "epoch": 0.8301117803949773, |
| "grad_norm": 102.5, |
| "learning_rate": 1.8875715998401493e-07, |
| "loss": 76.5221, |
| "step": 6925 |
| }, |
| { |
| "epoch": 0.8307111390811831, |
| "grad_norm": 104.5, |
| "learning_rate": 1.8809111495937125e-07, |
| "loss": 77.0011, |
| "step": 6930 |
| }, |
| { |
| "epoch": 0.8313104977673889, |
| "grad_norm": 107.25, |
| "learning_rate": 1.874250699347276e-07, |
| "loss": 77.0331, |
| "step": 6935 |
| }, |
| { |
| "epoch": 0.8319098564535946, |
| "grad_norm": 106.1875, |
| "learning_rate": 1.867590249100839e-07, |
| "loss": 77.2594, |
| "step": 6940 |
| }, |
| { |
| "epoch": 0.8325092151398004, |
| "grad_norm": 106.125, |
| "learning_rate": 1.8609297988544025e-07, |
| "loss": 76.9138, |
| "step": 6945 |
| }, |
| { |
| "epoch": 0.8331085738260062, |
| "grad_norm": 104.75, |
| "learning_rate": 1.8542693486079658e-07, |
| "loss": 76.312, |
| "step": 6950 |
| }, |
| { |
| "epoch": 0.833707932512212, |
| "grad_norm": 107.6875, |
| "learning_rate": 1.8476088983615293e-07, |
| "loss": 75.6801, |
| "step": 6955 |
| }, |
| { |
| "epoch": 0.8343072911984177, |
| "grad_norm": 102.125, |
| "learning_rate": 1.8409484481150926e-07, |
| "loss": 78.2077, |
| "step": 6960 |
| }, |
| { |
| "epoch": 0.8349066498846235, |
| "grad_norm": 107.5, |
| "learning_rate": 1.8342879978686558e-07, |
| "loss": 77.1983, |
| "step": 6965 |
| }, |
| { |
| "epoch": 0.8355060085708292, |
| "grad_norm": 106.1875, |
| "learning_rate": 1.827627547622219e-07, |
| "loss": 78.3859, |
| "step": 6970 |
| }, |
| { |
| "epoch": 0.836105367257035, |
| "grad_norm": 107.3125, |
| "learning_rate": 1.8209670973757826e-07, |
| "loss": 76.9793, |
| "step": 6975 |
| }, |
| { |
| "epoch": 0.8367047259432407, |
| "grad_norm": 109.5625, |
| "learning_rate": 1.8143066471293459e-07, |
| "loss": 75.6328, |
| "step": 6980 |
| }, |
| { |
| "epoch": 0.8373040846294465, |
| "grad_norm": 108.0625, |
| "learning_rate": 1.8076461968829094e-07, |
| "loss": 76.4836, |
| "step": 6985 |
| }, |
| { |
| "epoch": 0.8379034433156523, |
| "grad_norm": 107.0, |
| "learning_rate": 1.8009857466364726e-07, |
| "loss": 76.2991, |
| "step": 6990 |
| }, |
| { |
| "epoch": 0.838502802001858, |
| "grad_norm": 107.4375, |
| "learning_rate": 1.794325296390036e-07, |
| "loss": 75.9192, |
| "step": 6995 |
| }, |
| { |
| "epoch": 0.8391021606880638, |
| "grad_norm": 108.4375, |
| "learning_rate": 1.7876648461435991e-07, |
| "loss": 77.1332, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.8391021606880638, |
| "eval_loss": 2.4002504348754883, |
| "eval_runtime": 400.1746, |
| "eval_samples_per_second": 1123.517, |
| "eval_steps_per_second": 35.112, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.8397015193742695, |
| "grad_norm": 105.375, |
| "learning_rate": 1.7810043958971627e-07, |
| "loss": 76.3652, |
| "step": 7005 |
| }, |
| { |
| "epoch": 0.8403008780604753, |
| "grad_norm": 107.0, |
| "learning_rate": 1.774343945650726e-07, |
| "loss": 76.7703, |
| "step": 7010 |
| }, |
| { |
| "epoch": 0.840900236746681, |
| "grad_norm": 107.0, |
| "learning_rate": 1.7676834954042894e-07, |
| "loss": 75.7328, |
| "step": 7015 |
| }, |
| { |
| "epoch": 0.8414995954328868, |
| "grad_norm": 106.3125, |
| "learning_rate": 1.7610230451578524e-07, |
| "loss": 76.7758, |
| "step": 7020 |
| }, |
| { |
| "epoch": 0.8420989541190925, |
| "grad_norm": 107.9375, |
| "learning_rate": 1.754362594911416e-07, |
| "loss": 77.1049, |
| "step": 7025 |
| }, |
| { |
| "epoch": 0.8426983128052983, |
| "grad_norm": 107.375, |
| "learning_rate": 1.7477021446649792e-07, |
| "loss": 77.5039, |
| "step": 7030 |
| }, |
| { |
| "epoch": 0.8432976714915041, |
| "grad_norm": 110.0, |
| "learning_rate": 1.7410416944185427e-07, |
| "loss": 77.7867, |
| "step": 7035 |
| }, |
| { |
| "epoch": 0.8438970301777099, |
| "grad_norm": 105.6875, |
| "learning_rate": 1.734381244172106e-07, |
| "loss": 76.1607, |
| "step": 7040 |
| }, |
| { |
| "epoch": 0.8444963888639156, |
| "grad_norm": 109.1875, |
| "learning_rate": 1.7277207939256695e-07, |
| "loss": 77.016, |
| "step": 7045 |
| }, |
| { |
| "epoch": 0.8450957475501214, |
| "grad_norm": 104.875, |
| "learning_rate": 1.7210603436792325e-07, |
| "loss": 75.7198, |
| "step": 7050 |
| }, |
| { |
| "epoch": 0.8456951062363272, |
| "grad_norm": 106.1875, |
| "learning_rate": 1.714399893432796e-07, |
| "loss": 75.8812, |
| "step": 7055 |
| }, |
| { |
| "epoch": 0.8462944649225329, |
| "grad_norm": 107.125, |
| "learning_rate": 1.7077394431863593e-07, |
| "loss": 75.089, |
| "step": 7060 |
| }, |
| { |
| "epoch": 0.8468938236087387, |
| "grad_norm": 107.5, |
| "learning_rate": 1.7010789929399228e-07, |
| "loss": 76.6488, |
| "step": 7065 |
| }, |
| { |
| "epoch": 0.8474931822949444, |
| "grad_norm": 103.3125, |
| "learning_rate": 1.694418542693486e-07, |
| "loss": 77.9913, |
| "step": 7070 |
| }, |
| { |
| "epoch": 0.8480925409811502, |
| "grad_norm": 106.5, |
| "learning_rate": 1.6877580924470493e-07, |
| "loss": 75.9304, |
| "step": 7075 |
| }, |
| { |
| "epoch": 0.8486918996673559, |
| "grad_norm": 107.5, |
| "learning_rate": 1.6810976422006126e-07, |
| "loss": 77.1558, |
| "step": 7080 |
| }, |
| { |
| "epoch": 0.8492912583535617, |
| "grad_norm": 110.4375, |
| "learning_rate": 1.674437191954176e-07, |
| "loss": 75.9749, |
| "step": 7085 |
| }, |
| { |
| "epoch": 0.8498906170397674, |
| "grad_norm": 103.75, |
| "learning_rate": 1.6677767417077393e-07, |
| "loss": 76.0139, |
| "step": 7090 |
| }, |
| { |
| "epoch": 0.8504899757259732, |
| "grad_norm": 106.9375, |
| "learning_rate": 1.6611162914613029e-07, |
| "loss": 76.1673, |
| "step": 7095 |
| }, |
| { |
| "epoch": 0.8510893344121789, |
| "grad_norm": 106.9375, |
| "learning_rate": 1.6544558412148658e-07, |
| "loss": 76.9603, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.8516886930983847, |
| "grad_norm": 105.125, |
| "learning_rate": 1.6477953909684294e-07, |
| "loss": 75.6582, |
| "step": 7105 |
| }, |
| { |
| "epoch": 0.8522880517845904, |
| "grad_norm": 104.0, |
| "learning_rate": 1.6411349407219926e-07, |
| "loss": 76.3708, |
| "step": 7110 |
| }, |
| { |
| "epoch": 0.8528874104707963, |
| "grad_norm": 107.0, |
| "learning_rate": 1.6344744904755561e-07, |
| "loss": 76.4872, |
| "step": 7115 |
| }, |
| { |
| "epoch": 0.8534867691570021, |
| "grad_norm": 108.875, |
| "learning_rate": 1.6278140402291194e-07, |
| "loss": 76.6676, |
| "step": 7120 |
| }, |
| { |
| "epoch": 0.8540861278432078, |
| "grad_norm": 105.5, |
| "learning_rate": 1.621153589982683e-07, |
| "loss": 76.4368, |
| "step": 7125 |
| }, |
| { |
| "epoch": 0.8546854865294136, |
| "grad_norm": 107.25, |
| "learning_rate": 1.614493139736246e-07, |
| "loss": 77.9496, |
| "step": 7130 |
| }, |
| { |
| "epoch": 0.8552848452156193, |
| "grad_norm": 107.0, |
| "learning_rate": 1.6078326894898094e-07, |
| "loss": 76.4692, |
| "step": 7135 |
| }, |
| { |
| "epoch": 0.8558842039018251, |
| "grad_norm": 106.8125, |
| "learning_rate": 1.6011722392433727e-07, |
| "loss": 76.6703, |
| "step": 7140 |
| }, |
| { |
| "epoch": 0.8564835625880308, |
| "grad_norm": 106.75, |
| "learning_rate": 1.5945117889969362e-07, |
| "loss": 77.6701, |
| "step": 7145 |
| }, |
| { |
| "epoch": 0.8570829212742366, |
| "grad_norm": 106.0625, |
| "learning_rate": 1.5878513387504995e-07, |
| "loss": 75.0485, |
| "step": 7150 |
| }, |
| { |
| "epoch": 0.8576822799604423, |
| "grad_norm": 103.0, |
| "learning_rate": 1.5811908885040627e-07, |
| "loss": 75.7822, |
| "step": 7155 |
| }, |
| { |
| "epoch": 0.8582816386466481, |
| "grad_norm": 107.0625, |
| "learning_rate": 1.574530438257626e-07, |
| "loss": 77.6251, |
| "step": 7160 |
| }, |
| { |
| "epoch": 0.8588809973328538, |
| "grad_norm": 104.3125, |
| "learning_rate": 1.5678699880111895e-07, |
| "loss": 75.9858, |
| "step": 7165 |
| }, |
| { |
| "epoch": 0.8594803560190596, |
| "grad_norm": 107.5625, |
| "learning_rate": 1.5612095377647528e-07, |
| "loss": 76.3392, |
| "step": 7170 |
| }, |
| { |
| "epoch": 0.8600797147052653, |
| "grad_norm": 108.375, |
| "learning_rate": 1.5545490875183163e-07, |
| "loss": 77.0148, |
| "step": 7175 |
| }, |
| { |
| "epoch": 0.8606790733914711, |
| "grad_norm": 107.8125, |
| "learning_rate": 1.5478886372718795e-07, |
| "loss": 76.0265, |
| "step": 7180 |
| }, |
| { |
| "epoch": 0.8612784320776768, |
| "grad_norm": 105.625, |
| "learning_rate": 1.5412281870254428e-07, |
| "loss": 76.7633, |
| "step": 7185 |
| }, |
| { |
| "epoch": 0.8618777907638826, |
| "grad_norm": 107.75, |
| "learning_rate": 1.534567736779006e-07, |
| "loss": 76.8605, |
| "step": 7190 |
| }, |
| { |
| "epoch": 0.8624771494500884, |
| "grad_norm": 106.875, |
| "learning_rate": 1.5279072865325696e-07, |
| "loss": 77.3806, |
| "step": 7195 |
| }, |
| { |
| "epoch": 0.8630765081362942, |
| "grad_norm": 105.5, |
| "learning_rate": 1.5212468362861328e-07, |
| "loss": 75.9591, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.8636758668225, |
| "grad_norm": 106.875, |
| "learning_rate": 1.5145863860396963e-07, |
| "loss": 76.9259, |
| "step": 7205 |
| }, |
| { |
| "epoch": 0.8642752255087057, |
| "grad_norm": 110.0, |
| "learning_rate": 1.5079259357932593e-07, |
| "loss": 77.0442, |
| "step": 7210 |
| }, |
| { |
| "epoch": 0.8648745841949115, |
| "grad_norm": 105.625, |
| "learning_rate": 1.5012654855468228e-07, |
| "loss": 76.0748, |
| "step": 7215 |
| }, |
| { |
| "epoch": 0.8654739428811172, |
| "grad_norm": 105.5, |
| "learning_rate": 1.494605035300386e-07, |
| "loss": 75.3836, |
| "step": 7220 |
| }, |
| { |
| "epoch": 0.866073301567323, |
| "grad_norm": 105.375, |
| "learning_rate": 1.4879445850539496e-07, |
| "loss": 75.3334, |
| "step": 7225 |
| }, |
| { |
| "epoch": 0.8666726602535287, |
| "grad_norm": 109.375, |
| "learning_rate": 1.481284134807513e-07, |
| "loss": 76.1473, |
| "step": 7230 |
| }, |
| { |
| "epoch": 0.8672720189397345, |
| "grad_norm": 108.1875, |
| "learning_rate": 1.4746236845610764e-07, |
| "loss": 76.3155, |
| "step": 7235 |
| }, |
| { |
| "epoch": 0.8678713776259402, |
| "grad_norm": 110.5, |
| "learning_rate": 1.4679632343146394e-07, |
| "loss": 76.8435, |
| "step": 7240 |
| }, |
| { |
| "epoch": 0.868470736312146, |
| "grad_norm": 104.75, |
| "learning_rate": 1.461302784068203e-07, |
| "loss": 77.2809, |
| "step": 7245 |
| }, |
| { |
| "epoch": 0.8690700949983517, |
| "grad_norm": 106.9375, |
| "learning_rate": 1.4546423338217662e-07, |
| "loss": 77.4855, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.8696694536845575, |
| "grad_norm": 106.375, |
| "learning_rate": 1.4479818835753297e-07, |
| "loss": 76.6354, |
| "step": 7255 |
| }, |
| { |
| "epoch": 0.8702688123707633, |
| "grad_norm": 104.3125, |
| "learning_rate": 1.441321433328893e-07, |
| "loss": 75.2417, |
| "step": 7260 |
| }, |
| { |
| "epoch": 0.870868171056969, |
| "grad_norm": 107.8125, |
| "learning_rate": 1.4346609830824562e-07, |
| "loss": 77.3617, |
| "step": 7265 |
| }, |
| { |
| "epoch": 0.8714675297431748, |
| "grad_norm": 107.3125, |
| "learning_rate": 1.4280005328360197e-07, |
| "loss": 75.9692, |
| "step": 7270 |
| }, |
| { |
| "epoch": 0.8720668884293805, |
| "grad_norm": 106.3125, |
| "learning_rate": 1.421340082589583e-07, |
| "loss": 77.6722, |
| "step": 7275 |
| }, |
| { |
| "epoch": 0.8726662471155863, |
| "grad_norm": 108.375, |
| "learning_rate": 1.4146796323431465e-07, |
| "loss": 77.6011, |
| "step": 7280 |
| }, |
| { |
| "epoch": 0.8732656058017921, |
| "grad_norm": 105.5, |
| "learning_rate": 1.4080191820967098e-07, |
| "loss": 76.7692, |
| "step": 7285 |
| }, |
| { |
| "epoch": 0.8738649644879979, |
| "grad_norm": 105.125, |
| "learning_rate": 1.4013587318502733e-07, |
| "loss": 77.2192, |
| "step": 7290 |
| }, |
| { |
| "epoch": 0.8744643231742036, |
| "grad_norm": 109.125, |
| "learning_rate": 1.3946982816038363e-07, |
| "loss": 76.9221, |
| "step": 7295 |
| }, |
| { |
| "epoch": 0.8750636818604094, |
| "grad_norm": 107.9375, |
| "learning_rate": 1.3880378313573998e-07, |
| "loss": 76.3955, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.8756630405466151, |
| "grad_norm": 107.0, |
| "learning_rate": 1.381377381110963e-07, |
| "loss": 75.523, |
| "step": 7305 |
| }, |
| { |
| "epoch": 0.8762623992328209, |
| "grad_norm": 107.625, |
| "learning_rate": 1.3747169308645266e-07, |
| "loss": 77.1425, |
| "step": 7310 |
| }, |
| { |
| "epoch": 0.8768617579190267, |
| "grad_norm": 109.875, |
| "learning_rate": 1.3680564806180898e-07, |
| "loss": 76.5368, |
| "step": 7315 |
| }, |
| { |
| "epoch": 0.8774611166052324, |
| "grad_norm": 107.25, |
| "learning_rate": 1.361396030371653e-07, |
| "loss": 76.536, |
| "step": 7320 |
| }, |
| { |
| "epoch": 0.8780604752914382, |
| "grad_norm": 108.8125, |
| "learning_rate": 1.3547355801252163e-07, |
| "loss": 76.2158, |
| "step": 7325 |
| }, |
| { |
| "epoch": 0.8786598339776439, |
| "grad_norm": 108.625, |
| "learning_rate": 1.3480751298787798e-07, |
| "loss": 76.2001, |
| "step": 7330 |
| }, |
| { |
| "epoch": 0.8792591926638497, |
| "grad_norm": 105.6875, |
| "learning_rate": 1.341414679632343e-07, |
| "loss": 74.994, |
| "step": 7335 |
| }, |
| { |
| "epoch": 0.8798585513500554, |
| "grad_norm": 105.4375, |
| "learning_rate": 1.3347542293859066e-07, |
| "loss": 76.1967, |
| "step": 7340 |
| }, |
| { |
| "epoch": 0.8804579100362612, |
| "grad_norm": 106.0625, |
| "learning_rate": 1.3280937791394696e-07, |
| "loss": 76.307, |
| "step": 7345 |
| }, |
| { |
| "epoch": 0.8810572687224669, |
| "grad_norm": 108.125, |
| "learning_rate": 1.3214333288930331e-07, |
| "loss": 76.0177, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.8816566274086727, |
| "grad_norm": 106.4375, |
| "learning_rate": 1.3147728786465964e-07, |
| "loss": 77.1298, |
| "step": 7355 |
| }, |
| { |
| "epoch": 0.8822559860948784, |
| "grad_norm": 107.3125, |
| "learning_rate": 1.30811242840016e-07, |
| "loss": 74.9574, |
| "step": 7360 |
| }, |
| { |
| "epoch": 0.8828553447810843, |
| "grad_norm": 109.0, |
| "learning_rate": 1.3014519781537232e-07, |
| "loss": 77.0243, |
| "step": 7365 |
| }, |
| { |
| "epoch": 0.88345470346729, |
| "grad_norm": 107.5625, |
| "learning_rate": 1.2947915279072867e-07, |
| "loss": 76.6947, |
| "step": 7370 |
| }, |
| { |
| "epoch": 0.8840540621534958, |
| "grad_norm": 109.6875, |
| "learning_rate": 1.2881310776608497e-07, |
| "loss": 76.0506, |
| "step": 7375 |
| }, |
| { |
| "epoch": 0.8846534208397016, |
| "grad_norm": 111.5, |
| "learning_rate": 1.2814706274144132e-07, |
| "loss": 75.8644, |
| "step": 7380 |
| }, |
| { |
| "epoch": 0.8852527795259073, |
| "grad_norm": 107.5625, |
| "learning_rate": 1.2748101771679765e-07, |
| "loss": 76.0396, |
| "step": 7385 |
| }, |
| { |
| "epoch": 0.8858521382121131, |
| "grad_norm": 108.5, |
| "learning_rate": 1.26814972692154e-07, |
| "loss": 75.8297, |
| "step": 7390 |
| }, |
| { |
| "epoch": 0.8864514968983188, |
| "grad_norm": 104.625, |
| "learning_rate": 1.2614892766751032e-07, |
| "loss": 76.7324, |
| "step": 7395 |
| }, |
| { |
| "epoch": 0.8870508555845246, |
| "grad_norm": 106.625, |
| "learning_rate": 1.2548288264286665e-07, |
| "loss": 75.7794, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.8876502142707303, |
| "grad_norm": 107.5, |
| "learning_rate": 1.2481683761822297e-07, |
| "loss": 75.7284, |
| "step": 7405 |
| }, |
| { |
| "epoch": 0.8882495729569361, |
| "grad_norm": 106.375, |
| "learning_rate": 1.2415079259357933e-07, |
| "loss": 76.7833, |
| "step": 7410 |
| }, |
| { |
| "epoch": 0.8888489316431418, |
| "grad_norm": 110.5, |
| "learning_rate": 1.2348474756893565e-07, |
| "loss": 75.3238, |
| "step": 7415 |
| }, |
| { |
| "epoch": 0.8894482903293476, |
| "grad_norm": 107.5, |
| "learning_rate": 1.2281870254429198e-07, |
| "loss": 74.8932, |
| "step": 7420 |
| }, |
| { |
| "epoch": 0.8900476490155533, |
| "grad_norm": 110.4375, |
| "learning_rate": 1.2215265751964833e-07, |
| "loss": 76.3159, |
| "step": 7425 |
| }, |
| { |
| "epoch": 0.8906470077017591, |
| "grad_norm": 109.125, |
| "learning_rate": 1.2148661249500465e-07, |
| "loss": 76.1404, |
| "step": 7430 |
| }, |
| { |
| "epoch": 0.8912463663879648, |
| "grad_norm": 108.875, |
| "learning_rate": 1.2082056747036098e-07, |
| "loss": 76.7653, |
| "step": 7435 |
| }, |
| { |
| "epoch": 0.8918457250741706, |
| "grad_norm": 107.625, |
| "learning_rate": 1.201545224457173e-07, |
| "loss": 77.1986, |
| "step": 7440 |
| }, |
| { |
| "epoch": 0.8924450837603763, |
| "grad_norm": 107.875, |
| "learning_rate": 1.1948847742107366e-07, |
| "loss": 76.5664, |
| "step": 7445 |
| }, |
| { |
| "epoch": 0.8930444424465822, |
| "grad_norm": 104.5625, |
| "learning_rate": 1.1882243239643e-07, |
| "loss": 76.0746, |
| "step": 7450 |
| }, |
| { |
| "epoch": 0.893643801132788, |
| "grad_norm": 105.75, |
| "learning_rate": 1.1815638737178634e-07, |
| "loss": 75.1047, |
| "step": 7455 |
| }, |
| { |
| "epoch": 0.8942431598189937, |
| "grad_norm": 105.75, |
| "learning_rate": 1.1749034234714267e-07, |
| "loss": 74.8203, |
| "step": 7460 |
| }, |
| { |
| "epoch": 0.8948425185051995, |
| "grad_norm": 105.5, |
| "learning_rate": 1.16824297322499e-07, |
| "loss": 76.4513, |
| "step": 7465 |
| }, |
| { |
| "epoch": 0.8954418771914052, |
| "grad_norm": 108.875, |
| "learning_rate": 1.1615825229785534e-07, |
| "loss": 75.8262, |
| "step": 7470 |
| }, |
| { |
| "epoch": 0.896041235877611, |
| "grad_norm": 107.6875, |
| "learning_rate": 1.1549220727321166e-07, |
| "loss": 75.3952, |
| "step": 7475 |
| }, |
| { |
| "epoch": 0.8966405945638167, |
| "grad_norm": 106.0, |
| "learning_rate": 1.14826162248568e-07, |
| "loss": 75.8961, |
| "step": 7480 |
| }, |
| { |
| "epoch": 0.8972399532500225, |
| "grad_norm": 111.1875, |
| "learning_rate": 1.1416011722392434e-07, |
| "loss": 76.1357, |
| "step": 7485 |
| }, |
| { |
| "epoch": 0.8978393119362282, |
| "grad_norm": 107.8125, |
| "learning_rate": 1.1349407219928067e-07, |
| "loss": 76.3188, |
| "step": 7490 |
| }, |
| { |
| "epoch": 0.898438670622434, |
| "grad_norm": 105.5, |
| "learning_rate": 1.12828027174637e-07, |
| "loss": 76.0742, |
| "step": 7495 |
| }, |
| { |
| "epoch": 0.8990380293086397, |
| "grad_norm": 109.625, |
| "learning_rate": 1.1216198214999335e-07, |
| "loss": 75.6454, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.8990380293086397, |
| "eval_loss": 2.3795347213745117, |
| "eval_runtime": 400.1917, |
| "eval_samples_per_second": 1123.469, |
| "eval_steps_per_second": 35.111, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.8996373879948455, |
| "grad_norm": 106.25, |
| "learning_rate": 1.1149593712534967e-07, |
| "loss": 75.9472, |
| "step": 7505 |
| }, |
| { |
| "epoch": 0.9002367466810512, |
| "grad_norm": 109.3125, |
| "learning_rate": 1.1082989210070601e-07, |
| "loss": 76.2319, |
| "step": 7510 |
| }, |
| { |
| "epoch": 0.900836105367257, |
| "grad_norm": 108.1875, |
| "learning_rate": 1.1016384707606233e-07, |
| "loss": 76.4846, |
| "step": 7515 |
| }, |
| { |
| "epoch": 0.9014354640534628, |
| "grad_norm": 108.75, |
| "learning_rate": 1.0949780205141867e-07, |
| "loss": 77.3349, |
| "step": 7520 |
| }, |
| { |
| "epoch": 0.9020348227396685, |
| "grad_norm": 104.9375, |
| "learning_rate": 1.0883175702677501e-07, |
| "loss": 74.6866, |
| "step": 7525 |
| }, |
| { |
| "epoch": 0.9026341814258743, |
| "grad_norm": 110.25, |
| "learning_rate": 1.0816571200213134e-07, |
| "loss": 75.6139, |
| "step": 7530 |
| }, |
| { |
| "epoch": 0.9032335401120801, |
| "grad_norm": 108.4375, |
| "learning_rate": 1.0749966697748768e-07, |
| "loss": 76.1253, |
| "step": 7535 |
| }, |
| { |
| "epoch": 0.9038328987982859, |
| "grad_norm": 107.5, |
| "learning_rate": 1.0683362195284402e-07, |
| "loss": 75.2204, |
| "step": 7540 |
| }, |
| { |
| "epoch": 0.9044322574844916, |
| "grad_norm": 105.5, |
| "learning_rate": 1.0616757692820034e-07, |
| "loss": 75.8417, |
| "step": 7545 |
| }, |
| { |
| "epoch": 0.9050316161706974, |
| "grad_norm": 105.1875, |
| "learning_rate": 1.0550153190355668e-07, |
| "loss": 75.7311, |
| "step": 7550 |
| }, |
| { |
| "epoch": 0.9056309748569031, |
| "grad_norm": 101.625, |
| "learning_rate": 1.0483548687891302e-07, |
| "loss": 74.4821, |
| "step": 7555 |
| }, |
| { |
| "epoch": 0.9062303335431089, |
| "grad_norm": 106.4375, |
| "learning_rate": 1.0416944185426934e-07, |
| "loss": 75.5503, |
| "step": 7560 |
| }, |
| { |
| "epoch": 0.9068296922293146, |
| "grad_norm": 104.5, |
| "learning_rate": 1.0350339682962568e-07, |
| "loss": 76.8925, |
| "step": 7565 |
| }, |
| { |
| "epoch": 0.9074290509155204, |
| "grad_norm": 111.125, |
| "learning_rate": 1.0283735180498201e-07, |
| "loss": 76.4836, |
| "step": 7570 |
| }, |
| { |
| "epoch": 0.9080284096017261, |
| "grad_norm": 104.75, |
| "learning_rate": 1.0217130678033835e-07, |
| "loss": 77.4133, |
| "step": 7575 |
| }, |
| { |
| "epoch": 0.9086277682879319, |
| "grad_norm": 110.5, |
| "learning_rate": 1.0150526175569469e-07, |
| "loss": 76.1813, |
| "step": 7580 |
| }, |
| { |
| "epoch": 0.9092271269741377, |
| "grad_norm": 106.8125, |
| "learning_rate": 1.0083921673105101e-07, |
| "loss": 75.3809, |
| "step": 7585 |
| }, |
| { |
| "epoch": 0.9098264856603434, |
| "grad_norm": 104.5625, |
| "learning_rate": 1.0017317170640735e-07, |
| "loss": 76.3577, |
| "step": 7590 |
| }, |
| { |
| "epoch": 0.9104258443465492, |
| "grad_norm": 106.8125, |
| "learning_rate": 9.950712668176369e-08, |
| "loss": 75.8196, |
| "step": 7595 |
| }, |
| { |
| "epoch": 0.9110252030327549, |
| "grad_norm": 107.0625, |
| "learning_rate": 9.884108165712002e-08, |
| "loss": 76.5003, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.9116245617189607, |
| "grad_norm": 107.125, |
| "learning_rate": 9.817503663247635e-08, |
| "loss": 75.7129, |
| "step": 7605 |
| }, |
| { |
| "epoch": 0.9122239204051664, |
| "grad_norm": 106.9375, |
| "learning_rate": 9.750899160783268e-08, |
| "loss": 77.2414, |
| "step": 7610 |
| }, |
| { |
| "epoch": 0.9128232790913723, |
| "grad_norm": 105.3125, |
| "learning_rate": 9.684294658318902e-08, |
| "loss": 75.8869, |
| "step": 7615 |
| }, |
| { |
| "epoch": 0.913422637777578, |
| "grad_norm": 107.1875, |
| "learning_rate": 9.617690155854536e-08, |
| "loss": 74.8393, |
| "step": 7620 |
| }, |
| { |
| "epoch": 0.9140219964637838, |
| "grad_norm": 107.25, |
| "learning_rate": 9.551085653390168e-08, |
| "loss": 75.3144, |
| "step": 7625 |
| }, |
| { |
| "epoch": 0.9146213551499895, |
| "grad_norm": 106.5625, |
| "learning_rate": 9.484481150925802e-08, |
| "loss": 75.7677, |
| "step": 7630 |
| }, |
| { |
| "epoch": 0.9152207138361953, |
| "grad_norm": 106.625, |
| "learning_rate": 9.417876648461436e-08, |
| "loss": 75.8197, |
| "step": 7635 |
| }, |
| { |
| "epoch": 0.915820072522401, |
| "grad_norm": 108.875, |
| "learning_rate": 9.351272145997069e-08, |
| "loss": 75.7213, |
| "step": 7640 |
| }, |
| { |
| "epoch": 0.9164194312086068, |
| "grad_norm": 105.125, |
| "learning_rate": 9.284667643532702e-08, |
| "loss": 74.8678, |
| "step": 7645 |
| }, |
| { |
| "epoch": 0.9170187898948126, |
| "grad_norm": 109.5625, |
| "learning_rate": 9.218063141068336e-08, |
| "loss": 77.0629, |
| "step": 7650 |
| }, |
| { |
| "epoch": 0.9176181485810183, |
| "grad_norm": 106.6875, |
| "learning_rate": 9.151458638603969e-08, |
| "loss": 76.5193, |
| "step": 7655 |
| }, |
| { |
| "epoch": 0.9182175072672241, |
| "grad_norm": 109.1875, |
| "learning_rate": 9.084854136139603e-08, |
| "loss": 76.135, |
| "step": 7660 |
| }, |
| { |
| "epoch": 0.9188168659534298, |
| "grad_norm": 108.1875, |
| "learning_rate": 9.018249633675235e-08, |
| "loss": 75.7558, |
| "step": 7665 |
| }, |
| { |
| "epoch": 0.9194162246396356, |
| "grad_norm": 106.0625, |
| "learning_rate": 8.951645131210869e-08, |
| "loss": 76.6425, |
| "step": 7670 |
| }, |
| { |
| "epoch": 0.9200155833258413, |
| "grad_norm": 110.625, |
| "learning_rate": 8.885040628746503e-08, |
| "loss": 75.9904, |
| "step": 7675 |
| }, |
| { |
| "epoch": 0.9206149420120471, |
| "grad_norm": 106.8125, |
| "learning_rate": 8.818436126282136e-08, |
| "loss": 76.6544, |
| "step": 7680 |
| }, |
| { |
| "epoch": 0.9212143006982528, |
| "grad_norm": 105.375, |
| "learning_rate": 8.75183162381777e-08, |
| "loss": 76.6804, |
| "step": 7685 |
| }, |
| { |
| "epoch": 0.9218136593844586, |
| "grad_norm": 107.75, |
| "learning_rate": 8.685227121353403e-08, |
| "loss": 75.9191, |
| "step": 7690 |
| }, |
| { |
| "epoch": 0.9224130180706643, |
| "grad_norm": 105.9375, |
| "learning_rate": 8.618622618889036e-08, |
| "loss": 76.0447, |
| "step": 7695 |
| }, |
| { |
| "epoch": 0.9230123767568702, |
| "grad_norm": 107.75, |
| "learning_rate": 8.55201811642467e-08, |
| "loss": 75.6943, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.923611735443076, |
| "grad_norm": 108.4375, |
| "learning_rate": 8.485413613960302e-08, |
| "loss": 76.1021, |
| "step": 7705 |
| }, |
| { |
| "epoch": 0.9242110941292817, |
| "grad_norm": 106.3125, |
| "learning_rate": 8.418809111495936e-08, |
| "loss": 75.5128, |
| "step": 7710 |
| }, |
| { |
| "epoch": 0.9248104528154875, |
| "grad_norm": 107.6875, |
| "learning_rate": 8.35220460903157e-08, |
| "loss": 75.4414, |
| "step": 7715 |
| }, |
| { |
| "epoch": 0.9254098115016932, |
| "grad_norm": 107.9375, |
| "learning_rate": 8.285600106567203e-08, |
| "loss": 76.4457, |
| "step": 7720 |
| }, |
| { |
| "epoch": 0.926009170187899, |
| "grad_norm": 110.375, |
| "learning_rate": 8.218995604102837e-08, |
| "loss": 75.3731, |
| "step": 7725 |
| }, |
| { |
| "epoch": 0.9266085288741047, |
| "grad_norm": 106.25, |
| "learning_rate": 8.15239110163847e-08, |
| "loss": 75.2117, |
| "step": 7730 |
| }, |
| { |
| "epoch": 0.9272078875603105, |
| "grad_norm": 107.8125, |
| "learning_rate": 8.085786599174103e-08, |
| "loss": 76.5782, |
| "step": 7735 |
| }, |
| { |
| "epoch": 0.9278072462465162, |
| "grad_norm": 107.75, |
| "learning_rate": 8.019182096709737e-08, |
| "loss": 76.7815, |
| "step": 7740 |
| }, |
| { |
| "epoch": 0.928406604932722, |
| "grad_norm": 109.4375, |
| "learning_rate": 7.952577594245371e-08, |
| "loss": 76.2764, |
| "step": 7745 |
| }, |
| { |
| "epoch": 0.9290059636189277, |
| "grad_norm": 108.0, |
| "learning_rate": 7.885973091781003e-08, |
| "loss": 75.9319, |
| "step": 7750 |
| }, |
| { |
| "epoch": 0.9296053223051335, |
| "grad_norm": 107.5625, |
| "learning_rate": 7.819368589316637e-08, |
| "loss": 75.9743, |
| "step": 7755 |
| }, |
| { |
| "epoch": 0.9302046809913392, |
| "grad_norm": 109.5, |
| "learning_rate": 7.75276408685227e-08, |
| "loss": 74.1208, |
| "step": 7760 |
| }, |
| { |
| "epoch": 0.930804039677545, |
| "grad_norm": 108.75, |
| "learning_rate": 7.686159584387904e-08, |
| "loss": 75.7255, |
| "step": 7765 |
| }, |
| { |
| "epoch": 0.9314033983637507, |
| "grad_norm": 104.6875, |
| "learning_rate": 7.619555081923538e-08, |
| "loss": 75.9585, |
| "step": 7770 |
| }, |
| { |
| "epoch": 0.9320027570499565, |
| "grad_norm": 103.75, |
| "learning_rate": 7.55295057945917e-08, |
| "loss": 75.6327, |
| "step": 7775 |
| }, |
| { |
| "epoch": 0.9326021157361623, |
| "grad_norm": 105.5625, |
| "learning_rate": 7.486346076994804e-08, |
| "loss": 76.4223, |
| "step": 7780 |
| }, |
| { |
| "epoch": 0.9332014744223681, |
| "grad_norm": 107.1875, |
| "learning_rate": 7.419741574530438e-08, |
| "loss": 74.8369, |
| "step": 7785 |
| }, |
| { |
| "epoch": 0.9338008331085739, |
| "grad_norm": 108.3125, |
| "learning_rate": 7.35313707206607e-08, |
| "loss": 75.6269, |
| "step": 7790 |
| }, |
| { |
| "epoch": 0.9344001917947796, |
| "grad_norm": 105.8125, |
| "learning_rate": 7.286532569601704e-08, |
| "loss": 76.3296, |
| "step": 7795 |
| }, |
| { |
| "epoch": 0.9349995504809854, |
| "grad_norm": 105.875, |
| "learning_rate": 7.219928067137337e-08, |
| "loss": 73.8214, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.9355989091671911, |
| "grad_norm": 104.0625, |
| "learning_rate": 7.153323564672971e-08, |
| "loss": 74.8871, |
| "step": 7805 |
| }, |
| { |
| "epoch": 0.9361982678533969, |
| "grad_norm": 107.25, |
| "learning_rate": 7.086719062208606e-08, |
| "loss": 76.116, |
| "step": 7810 |
| }, |
| { |
| "epoch": 0.9367976265396026, |
| "grad_norm": 105.0625, |
| "learning_rate": 7.020114559744239e-08, |
| "loss": 75.9282, |
| "step": 7815 |
| }, |
| { |
| "epoch": 0.9373969852258084, |
| "grad_norm": 108.8125, |
| "learning_rate": 6.953510057279872e-08, |
| "loss": 75.725, |
| "step": 7820 |
| }, |
| { |
| "epoch": 0.9379963439120141, |
| "grad_norm": 108.125, |
| "learning_rate": 6.886905554815506e-08, |
| "loss": 75.2024, |
| "step": 7825 |
| }, |
| { |
| "epoch": 0.9385957025982199, |
| "grad_norm": 105.0625, |
| "learning_rate": 6.820301052351139e-08, |
| "loss": 74.4366, |
| "step": 7830 |
| }, |
| { |
| "epoch": 0.9391950612844256, |
| "grad_norm": 105.3125, |
| "learning_rate": 6.753696549886773e-08, |
| "loss": 75.688, |
| "step": 7835 |
| }, |
| { |
| "epoch": 0.9397944199706314, |
| "grad_norm": 105.9375, |
| "learning_rate": 6.687092047422407e-08, |
| "loss": 75.9839, |
| "step": 7840 |
| }, |
| { |
| "epoch": 0.9403937786568372, |
| "grad_norm": 106.875, |
| "learning_rate": 6.620487544958039e-08, |
| "loss": 75.7041, |
| "step": 7845 |
| }, |
| { |
| "epoch": 0.9409931373430429, |
| "grad_norm": 107.4375, |
| "learning_rate": 6.553883042493673e-08, |
| "loss": 75.4809, |
| "step": 7850 |
| }, |
| { |
| "epoch": 0.9415924960292487, |
| "grad_norm": 106.1875, |
| "learning_rate": 6.487278540029306e-08, |
| "loss": 75.421, |
| "step": 7855 |
| }, |
| { |
| "epoch": 0.9421918547154544, |
| "grad_norm": 108.4375, |
| "learning_rate": 6.42067403756494e-08, |
| "loss": 76.3878, |
| "step": 7860 |
| }, |
| { |
| "epoch": 0.9427912134016603, |
| "grad_norm": 102.6875, |
| "learning_rate": 6.354069535100573e-08, |
| "loss": 75.1176, |
| "step": 7865 |
| }, |
| { |
| "epoch": 0.943390572087866, |
| "grad_norm": 107.6875, |
| "learning_rate": 6.287465032636206e-08, |
| "loss": 75.2976, |
| "step": 7870 |
| }, |
| { |
| "epoch": 0.9439899307740718, |
| "grad_norm": 106.375, |
| "learning_rate": 6.220860530171838e-08, |
| "loss": 75.4411, |
| "step": 7875 |
| }, |
| { |
| "epoch": 0.9445892894602775, |
| "grad_norm": 110.125, |
| "learning_rate": 6.154256027707472e-08, |
| "loss": 75.5738, |
| "step": 7880 |
| }, |
| { |
| "epoch": 0.9451886481464833, |
| "grad_norm": 104.75, |
| "learning_rate": 6.087651525243106e-08, |
| "loss": 75.0076, |
| "step": 7885 |
| }, |
| { |
| "epoch": 0.945788006832689, |
| "grad_norm": 106.3125, |
| "learning_rate": 6.021047022778739e-08, |
| "loss": 75.3758, |
| "step": 7890 |
| }, |
| { |
| "epoch": 0.9463873655188948, |
| "grad_norm": 103.875, |
| "learning_rate": 5.9544425203143727e-08, |
| "loss": 75.9719, |
| "step": 7895 |
| }, |
| { |
| "epoch": 0.9469867242051005, |
| "grad_norm": 106.75, |
| "learning_rate": 5.8878380178500065e-08, |
| "loss": 75.7151, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.9475860828913063, |
| "grad_norm": 106.4375, |
| "learning_rate": 5.8212335153856404e-08, |
| "loss": 74.1712, |
| "step": 7905 |
| }, |
| { |
| "epoch": 0.9481854415775121, |
| "grad_norm": 107.0, |
| "learning_rate": 5.7546290129212736e-08, |
| "loss": 76.8725, |
| "step": 7910 |
| }, |
| { |
| "epoch": 0.9487848002637178, |
| "grad_norm": 105.125, |
| "learning_rate": 5.688024510456907e-08, |
| "loss": 76.729, |
| "step": 7915 |
| }, |
| { |
| "epoch": 0.9493841589499236, |
| "grad_norm": 110.8125, |
| "learning_rate": 5.62142000799254e-08, |
| "loss": 75.4301, |
| "step": 7920 |
| }, |
| { |
| "epoch": 0.9499835176361293, |
| "grad_norm": 109.9375, |
| "learning_rate": 5.554815505528174e-08, |
| "loss": 75.4277, |
| "step": 7925 |
| }, |
| { |
| "epoch": 0.9505828763223351, |
| "grad_norm": 109.0625, |
| "learning_rate": 5.488211003063807e-08, |
| "loss": 74.834, |
| "step": 7930 |
| }, |
| { |
| "epoch": 0.9511822350085408, |
| "grad_norm": 104.0, |
| "learning_rate": 5.4216065005994404e-08, |
| "loss": 75.1636, |
| "step": 7935 |
| }, |
| { |
| "epoch": 0.9517815936947466, |
| "grad_norm": 108.625, |
| "learning_rate": 5.3550019981350736e-08, |
| "loss": 76.5781, |
| "step": 7940 |
| }, |
| { |
| "epoch": 0.9523809523809523, |
| "grad_norm": 108.125, |
| "learning_rate": 5.2883974956707075e-08, |
| "loss": 75.4816, |
| "step": 7945 |
| }, |
| { |
| "epoch": 0.9529803110671582, |
| "grad_norm": 104.625, |
| "learning_rate": 5.221792993206341e-08, |
| "loss": 75.2837, |
| "step": 7950 |
| }, |
| { |
| "epoch": 0.953579669753364, |
| "grad_norm": 105.5625, |
| "learning_rate": 5.155188490741974e-08, |
| "loss": 75.0366, |
| "step": 7955 |
| }, |
| { |
| "epoch": 0.9541790284395697, |
| "grad_norm": 110.5, |
| "learning_rate": 5.088583988277607e-08, |
| "loss": 74.8486, |
| "step": 7960 |
| }, |
| { |
| "epoch": 0.9547783871257755, |
| "grad_norm": 106.125, |
| "learning_rate": 5.021979485813241e-08, |
| "loss": 74.3942, |
| "step": 7965 |
| }, |
| { |
| "epoch": 0.9553777458119812, |
| "grad_norm": 108.1875, |
| "learning_rate": 4.955374983348874e-08, |
| "loss": 75.9698, |
| "step": 7970 |
| }, |
| { |
| "epoch": 0.955977104498187, |
| "grad_norm": 105.5, |
| "learning_rate": 4.8887704808845075e-08, |
| "loss": 74.0582, |
| "step": 7975 |
| }, |
| { |
| "epoch": 0.9565764631843927, |
| "grad_norm": 108.125, |
| "learning_rate": 4.822165978420141e-08, |
| "loss": 75.5124, |
| "step": 7980 |
| }, |
| { |
| "epoch": 0.9571758218705985, |
| "grad_norm": 109.0625, |
| "learning_rate": 4.7555614759557746e-08, |
| "loss": 75.5287, |
| "step": 7985 |
| }, |
| { |
| "epoch": 0.9577751805568042, |
| "grad_norm": 105.25, |
| "learning_rate": 4.688956973491408e-08, |
| "loss": 74.6588, |
| "step": 7990 |
| }, |
| { |
| "epoch": 0.95837453924301, |
| "grad_norm": 108.25, |
| "learning_rate": 4.622352471027041e-08, |
| "loss": 74.4629, |
| "step": 7995 |
| }, |
| { |
| "epoch": 0.9589738979292157, |
| "grad_norm": 107.0625, |
| "learning_rate": 4.555747968562675e-08, |
| "loss": 74.7334, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.9589738979292157, |
| "eval_loss": 2.360534191131592, |
| "eval_runtime": 408.2462, |
| "eval_samples_per_second": 1101.304, |
| "eval_steps_per_second": 34.418, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.9595732566154215, |
| "grad_norm": 102.625, |
| "learning_rate": 4.489143466098308e-08, |
| "loss": 76.2619, |
| "step": 8005 |
| }, |
| { |
| "epoch": 0.9601726153016272, |
| "grad_norm": 111.1875, |
| "learning_rate": 4.422538963633941e-08, |
| "loss": 74.1844, |
| "step": 8010 |
| }, |
| { |
| "epoch": 0.960771973987833, |
| "grad_norm": 104.0, |
| "learning_rate": 4.3559344611695745e-08, |
| "loss": 76.1778, |
| "step": 8015 |
| }, |
| { |
| "epoch": 0.9613713326740387, |
| "grad_norm": 106.9375, |
| "learning_rate": 4.2893299587052084e-08, |
| "loss": 77.1481, |
| "step": 8020 |
| }, |
| { |
| "epoch": 0.9619706913602445, |
| "grad_norm": 105.9375, |
| "learning_rate": 4.2227254562408416e-08, |
| "loss": 75.8017, |
| "step": 8025 |
| }, |
| { |
| "epoch": 0.9625700500464502, |
| "grad_norm": 106.0, |
| "learning_rate": 4.156120953776475e-08, |
| "loss": 75.1555, |
| "step": 8030 |
| }, |
| { |
| "epoch": 0.9631694087326561, |
| "grad_norm": 108.375, |
| "learning_rate": 4.089516451312108e-08, |
| "loss": 75.7826, |
| "step": 8035 |
| }, |
| { |
| "epoch": 0.9637687674188619, |
| "grad_norm": 106.5, |
| "learning_rate": 4.022911948847742e-08, |
| "loss": 73.2987, |
| "step": 8040 |
| }, |
| { |
| "epoch": 0.9643681261050676, |
| "grad_norm": 105.0, |
| "learning_rate": 3.956307446383375e-08, |
| "loss": 74.986, |
| "step": 8045 |
| }, |
| { |
| "epoch": 0.9649674847912734, |
| "grad_norm": 104.5, |
| "learning_rate": 3.8897029439190084e-08, |
| "loss": 75.9831, |
| "step": 8050 |
| }, |
| { |
| "epoch": 0.9655668434774791, |
| "grad_norm": 107.0, |
| "learning_rate": 3.8230984414546416e-08, |
| "loss": 75.217, |
| "step": 8055 |
| }, |
| { |
| "epoch": 0.9661662021636849, |
| "grad_norm": 108.625, |
| "learning_rate": 3.7564939389902755e-08, |
| "loss": 76.5886, |
| "step": 8060 |
| }, |
| { |
| "epoch": 0.9667655608498906, |
| "grad_norm": 110.6875, |
| "learning_rate": 3.689889436525909e-08, |
| "loss": 75.2561, |
| "step": 8065 |
| }, |
| { |
| "epoch": 0.9673649195360964, |
| "grad_norm": 107.0625, |
| "learning_rate": 3.623284934061542e-08, |
| "loss": 75.0127, |
| "step": 8070 |
| }, |
| { |
| "epoch": 0.9679642782223021, |
| "grad_norm": 106.8125, |
| "learning_rate": 3.5566804315971765e-08, |
| "loss": 76.2585, |
| "step": 8075 |
| }, |
| { |
| "epoch": 0.9685636369085079, |
| "grad_norm": 102.5625, |
| "learning_rate": 3.49007592913281e-08, |
| "loss": 75.7222, |
| "step": 8080 |
| }, |
| { |
| "epoch": 0.9691629955947136, |
| "grad_norm": 105.625, |
| "learning_rate": 3.423471426668443e-08, |
| "loss": 75.6719, |
| "step": 8085 |
| }, |
| { |
| "epoch": 0.9697623542809194, |
| "grad_norm": 106.1875, |
| "learning_rate": 3.356866924204076e-08, |
| "loss": 74.9915, |
| "step": 8090 |
| }, |
| { |
| "epoch": 0.9703617129671251, |
| "grad_norm": 106.4375, |
| "learning_rate": 3.29026242173971e-08, |
| "loss": 75.2533, |
| "step": 8095 |
| }, |
| { |
| "epoch": 0.9709610716533309, |
| "grad_norm": 102.875, |
| "learning_rate": 3.223657919275343e-08, |
| "loss": 74.9591, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.9715604303395367, |
| "grad_norm": 106.0, |
| "learning_rate": 3.1570534168109764e-08, |
| "loss": 75.4972, |
| "step": 8105 |
| }, |
| { |
| "epoch": 0.9721597890257424, |
| "grad_norm": 110.0, |
| "learning_rate": 3.0904489143466097e-08, |
| "loss": 75.5078, |
| "step": 8110 |
| }, |
| { |
| "epoch": 0.9727591477119483, |
| "grad_norm": 109.375, |
| "learning_rate": 3.023844411882243e-08, |
| "loss": 75.4631, |
| "step": 8115 |
| }, |
| { |
| "epoch": 0.973358506398154, |
| "grad_norm": 104.6875, |
| "learning_rate": 2.9572399094178768e-08, |
| "loss": 73.9664, |
| "step": 8120 |
| }, |
| { |
| "epoch": 0.9739578650843598, |
| "grad_norm": 107.1875, |
| "learning_rate": 2.89063540695351e-08, |
| "loss": 74.6196, |
| "step": 8125 |
| }, |
| { |
| "epoch": 0.9745572237705655, |
| "grad_norm": 104.8125, |
| "learning_rate": 2.8240309044891435e-08, |
| "loss": 73.8321, |
| "step": 8130 |
| }, |
| { |
| "epoch": 0.9751565824567713, |
| "grad_norm": 106.875, |
| "learning_rate": 2.7574264020247767e-08, |
| "loss": 73.9742, |
| "step": 8135 |
| }, |
| { |
| "epoch": 0.975755941142977, |
| "grad_norm": 105.4375, |
| "learning_rate": 2.6908218995604103e-08, |
| "loss": 74.6413, |
| "step": 8140 |
| }, |
| { |
| "epoch": 0.9763552998291828, |
| "grad_norm": 112.0625, |
| "learning_rate": 2.6242173970960435e-08, |
| "loss": 76.6869, |
| "step": 8145 |
| }, |
| { |
| "epoch": 0.9769546585153885, |
| "grad_norm": 109.125, |
| "learning_rate": 2.557612894631677e-08, |
| "loss": 75.1903, |
| "step": 8150 |
| }, |
| { |
| "epoch": 0.9775540172015943, |
| "grad_norm": 106.0625, |
| "learning_rate": 2.4910083921673103e-08, |
| "loss": 75.9438, |
| "step": 8155 |
| }, |
| { |
| "epoch": 0.9781533758878, |
| "grad_norm": 105.375, |
| "learning_rate": 2.4244038897029438e-08, |
| "loss": 75.6281, |
| "step": 8160 |
| }, |
| { |
| "epoch": 0.9787527345740058, |
| "grad_norm": 105.75, |
| "learning_rate": 2.357799387238577e-08, |
| "loss": 75.2257, |
| "step": 8165 |
| }, |
| { |
| "epoch": 0.9793520932602116, |
| "grad_norm": 106.5625, |
| "learning_rate": 2.2911948847742106e-08, |
| "loss": 75.081, |
| "step": 8170 |
| }, |
| { |
| "epoch": 0.9799514519464173, |
| "grad_norm": 107.5, |
| "learning_rate": 2.224590382309844e-08, |
| "loss": 74.4403, |
| "step": 8175 |
| }, |
| { |
| "epoch": 0.9805508106326231, |
| "grad_norm": 108.625, |
| "learning_rate": 2.1579858798454774e-08, |
| "loss": 75.8791, |
| "step": 8180 |
| }, |
| { |
| "epoch": 0.9811501693188288, |
| "grad_norm": 106.375, |
| "learning_rate": 2.091381377381111e-08, |
| "loss": 75.0459, |
| "step": 8185 |
| }, |
| { |
| "epoch": 0.9817495280050346, |
| "grad_norm": 103.625, |
| "learning_rate": 2.024776874916744e-08, |
| "loss": 74.4976, |
| "step": 8190 |
| }, |
| { |
| "epoch": 0.9823488866912403, |
| "grad_norm": 105.3125, |
| "learning_rate": 1.9581723724523777e-08, |
| "loss": 75.4353, |
| "step": 8195 |
| }, |
| { |
| "epoch": 0.9829482453774462, |
| "grad_norm": 107.5625, |
| "learning_rate": 1.891567869988011e-08, |
| "loss": 74.7069, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.9835476040636519, |
| "grad_norm": 109.3125, |
| "learning_rate": 1.8249633675236445e-08, |
| "loss": 76.061, |
| "step": 8205 |
| }, |
| { |
| "epoch": 0.9841469627498577, |
| "grad_norm": 108.3125, |
| "learning_rate": 1.758358865059278e-08, |
| "loss": 75.0838, |
| "step": 8210 |
| }, |
| { |
| "epoch": 0.9847463214360634, |
| "grad_norm": 107.0, |
| "learning_rate": 1.6917543625949116e-08, |
| "loss": 75.832, |
| "step": 8215 |
| }, |
| { |
| "epoch": 0.9853456801222692, |
| "grad_norm": 103.1875, |
| "learning_rate": 1.6251498601305448e-08, |
| "loss": 75.316, |
| "step": 8220 |
| }, |
| { |
| "epoch": 0.985945038808475, |
| "grad_norm": 104.9375, |
| "learning_rate": 1.558545357666178e-08, |
| "loss": 75.5353, |
| "step": 8225 |
| }, |
| { |
| "epoch": 0.9865443974946807, |
| "grad_norm": 108.6875, |
| "learning_rate": 1.4919408552018115e-08, |
| "loss": 75.2492, |
| "step": 8230 |
| }, |
| { |
| "epoch": 0.9871437561808865, |
| "grad_norm": 106.25, |
| "learning_rate": 1.4253363527374451e-08, |
| "loss": 74.484, |
| "step": 8235 |
| }, |
| { |
| "epoch": 0.9877431148670922, |
| "grad_norm": 110.6875, |
| "learning_rate": 1.3587318502730785e-08, |
| "loss": 75.8064, |
| "step": 8240 |
| }, |
| { |
| "epoch": 0.988342473553298, |
| "grad_norm": 109.5, |
| "learning_rate": 1.2921273478087119e-08, |
| "loss": 74.8354, |
| "step": 8245 |
| }, |
| { |
| "epoch": 0.9889418322395037, |
| "grad_norm": 108.0, |
| "learning_rate": 1.2255228453443452e-08, |
| "loss": 75.0737, |
| "step": 8250 |
| }, |
| { |
| "epoch": 0.9895411909257095, |
| "grad_norm": 105.8125, |
| "learning_rate": 1.1589183428799786e-08, |
| "loss": 75.3057, |
| "step": 8255 |
| }, |
| { |
| "epoch": 0.9901405496119152, |
| "grad_norm": 106.875, |
| "learning_rate": 1.092313840415612e-08, |
| "loss": 74.6023, |
| "step": 8260 |
| }, |
| { |
| "epoch": 0.990739908298121, |
| "grad_norm": 108.25, |
| "learning_rate": 1.0257093379512454e-08, |
| "loss": 75.8229, |
| "step": 8265 |
| }, |
| { |
| "epoch": 0.9913392669843267, |
| "grad_norm": 105.125, |
| "learning_rate": 9.591048354868788e-09, |
| "loss": 75.3764, |
| "step": 8270 |
| }, |
| { |
| "epoch": 0.9919386256705325, |
| "grad_norm": 105.0, |
| "learning_rate": 8.925003330225123e-09, |
| "loss": 75.4075, |
| "step": 8275 |
| }, |
| { |
| "epoch": 0.9925379843567383, |
| "grad_norm": 106.5625, |
| "learning_rate": 8.258958305581457e-09, |
| "loss": 75.7693, |
| "step": 8280 |
| }, |
| { |
| "epoch": 0.9931373430429441, |
| "grad_norm": 111.6875, |
| "learning_rate": 7.592913280937791e-09, |
| "loss": 76.1424, |
| "step": 8285 |
| }, |
| { |
| "epoch": 0.9937367017291499, |
| "grad_norm": 105.625, |
| "learning_rate": 6.926868256294126e-09, |
| "loss": 74.6477, |
| "step": 8290 |
| }, |
| { |
| "epoch": 0.9943360604153556, |
| "grad_norm": 106.0, |
| "learning_rate": 6.2608232316504594e-09, |
| "loss": 75.2063, |
| "step": 8295 |
| }, |
| { |
| "epoch": 0.9949354191015614, |
| "grad_norm": 104.1875, |
| "learning_rate": 5.594778207006793e-09, |
| "loss": 75.8802, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.9955347777877671, |
| "grad_norm": 108.75, |
| "learning_rate": 4.928733182363127e-09, |
| "loss": 74.0212, |
| "step": 8305 |
| }, |
| { |
| "epoch": 0.9961341364739729, |
| "grad_norm": 107.9375, |
| "learning_rate": 4.262688157719462e-09, |
| "loss": 75.8792, |
| "step": 8310 |
| }, |
| { |
| "epoch": 0.9967334951601786, |
| "grad_norm": 102.875, |
| "learning_rate": 3.596643133075796e-09, |
| "loss": 74.9816, |
| "step": 8315 |
| }, |
| { |
| "epoch": 0.9973328538463844, |
| "grad_norm": 105.3125, |
| "learning_rate": 2.93059810843213e-09, |
| "loss": 75.1708, |
| "step": 8320 |
| }, |
| { |
| "epoch": 0.9979322125325901, |
| "grad_norm": 108.8125, |
| "learning_rate": 2.2645530837884637e-09, |
| "loss": 75.8946, |
| "step": 8325 |
| }, |
| { |
| "epoch": 0.9985315712187959, |
| "grad_norm": 106.875, |
| "learning_rate": 1.5985080591447982e-09, |
| "loss": 75.4009, |
| "step": 8330 |
| }, |
| { |
| "epoch": 0.9991309299050016, |
| "grad_norm": 103.625, |
| "learning_rate": 9.324630345011322e-10, |
| "loss": 73.9491, |
| "step": 8335 |
| }, |
| { |
| "epoch": 0.9997302885912074, |
| "grad_norm": 106.25, |
| "learning_rate": 2.6641800985746636e-10, |
| "loss": 74.3839, |
| "step": 8340 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 8342, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.614053037573669e+19, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|