{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005, "grad_norm": 27.75, "learning_rate": 2.25e-06, "loss": 2.866, "step": 10 }, { "epoch": 0.001, "grad_norm": 16.875, "learning_rate": 4.75e-06, "loss": 2.6711, "step": 20 }, { "epoch": 0.0015, "grad_norm": 7.59375, "learning_rate": 7.25e-06, "loss": 2.267, "step": 30 }, { "epoch": 0.002, "grad_norm": 4.0625, "learning_rate": 9.75e-06, "loss": 1.9987, "step": 40 }, { "epoch": 0.0025, "grad_norm": 2.5625, "learning_rate": 9.999994983467545e-06, "loss": 1.8596, "step": 50 }, { "epoch": 0.003, "grad_norm": 2.21875, "learning_rate": 9.99997764238062e-06, "loss": 1.7805, "step": 60 }, { "epoch": 0.0035, "grad_norm": 1.9296875, "learning_rate": 9.999947914849675e-06, "loss": 1.7248, "step": 70 }, { "epoch": 0.004, "grad_norm": 2.03125, "learning_rate": 9.999905800948354e-06, "loss": 1.6839, "step": 80 }, { "epoch": 0.0045, "grad_norm": 1.765625, "learning_rate": 9.999851300780988e-06, "loss": 1.668, "step": 90 }, { "epoch": 0.005, "grad_norm": 1.8828125, "learning_rate": 9.999784414482586e-06, "loss": 1.648, "step": 100 }, { "epoch": 0.0055, "grad_norm": 1.703125, "learning_rate": 9.99970514221885e-06, "loss": 1.6319, "step": 110 }, { "epoch": 0.006, "grad_norm": 1.9140625, "learning_rate": 9.999613484186158e-06, "loss": 1.6239, "step": 120 }, { "epoch": 0.0065, "grad_norm": 1.9375, "learning_rate": 9.999509440611575e-06, "loss": 1.6207, "step": 130 }, { "epoch": 0.007, "grad_norm": 2.140625, "learning_rate": 9.999393011752848e-06, "loss": 1.6432, "step": 140 }, { "epoch": 0.0075, "grad_norm": 1.8125, "learning_rate": 9.999264197898409e-06, "loss": 1.603, "step": 150 }, { "epoch": 0.008, "grad_norm": 1.8359375, "learning_rate": 9.999122999367365e-06, "loss": 1.5955, "step": 160 }, { "epoch": 0.0085, "grad_norm": 1.7109375, "learning_rate": 9.99896941650951e-06, "loss": 1.5849, "step": 170 }, { "epoch": 0.009, "grad_norm": 1.8515625, "learning_rate": 9.998803449705312e-06, "loss": 1.5696, "step": 180 }, { "epoch": 0.0095, "grad_norm": 1.96875, "learning_rate": 9.998625099365922e-06, "loss": 1.5882, "step": 190 }, { "epoch": 0.01, "grad_norm": 1.9609375, "learning_rate": 9.998434365933167e-06, "loss": 1.5987, "step": 200 }, { "epoch": 0.0105, "grad_norm": 1.953125, "learning_rate": 9.99823124987955e-06, "loss": 1.5687, "step": 210 }, { "epoch": 0.011, "grad_norm": 1.8828125, "learning_rate": 9.998015751708252e-06, "loss": 1.5758, "step": 220 }, { "epoch": 0.0115, "grad_norm": 1.6796875, "learning_rate": 9.997787871953126e-06, "loss": 1.5673, "step": 230 }, { "epoch": 0.012, "grad_norm": 1.9375, "learning_rate": 9.997547611178699e-06, "loss": 1.5613, "step": 240 }, { "epoch": 0.0125, "grad_norm": 1.7734375, "learning_rate": 9.997294969980168e-06, "loss": 1.5688, "step": 250 }, { "epoch": 0.013, "grad_norm": 1.765625, "learning_rate": 9.997029948983402e-06, "loss": 1.5537, "step": 260 }, { "epoch": 0.0135, "grad_norm": 3.109375, "learning_rate": 9.996752548844935e-06, "loss": 1.5509, "step": 270 }, { "epoch": 0.014, "grad_norm": 1.8359375, "learning_rate": 9.996462770251973e-06, "loss": 1.5634, "step": 280 }, { "epoch": 0.0145, "grad_norm": 1.7421875, "learning_rate": 9.996160613922385e-06, "loss": 1.5546, "step": 290 }, { "epoch": 0.015, "grad_norm": 3.34375, "learning_rate": 9.995846080604701e-06, "loss": 1.535, "step": 300 }, { "epoch": 0.0155, "grad_norm": 1.9296875, "learning_rate": 9.995519171078114e-06, "loss": 1.5545, "step": 310 }, { "epoch": 0.016, "grad_norm": 1.8203125, "learning_rate": 9.99517988615248e-06, "loss": 1.547, "step": 320 }, { "epoch": 0.0165, "grad_norm": 1.84375, "learning_rate": 9.994828226668305e-06, "loss": 1.5404, "step": 330 }, { "epoch": 0.017, "grad_norm": 1.7109375, "learning_rate": 9.994464193496758e-06, "loss": 1.5442, "step": 340 }, { "epoch": 0.0175, "grad_norm": 1.734375, "learning_rate": 9.99408778753966e-06, "loss": 1.5387, "step": 350 }, { "epoch": 0.018, "grad_norm": 1.859375, "learning_rate": 9.993699009729476e-06, "loss": 1.5318, "step": 360 }, { "epoch": 0.0185, "grad_norm": 1.90625, "learning_rate": 9.99329786102933e-06, "loss": 1.5244, "step": 370 }, { "epoch": 0.019, "grad_norm": 2.015625, "learning_rate": 9.992884342432984e-06, "loss": 1.5448, "step": 380 }, { "epoch": 0.0195, "grad_norm": 1.8203125, "learning_rate": 9.99245845496485e-06, "loss": 1.5127, "step": 390 }, { "epoch": 0.02, "grad_norm": 1.8046875, "learning_rate": 9.992020199679982e-06, "loss": 1.5237, "step": 400 }, { "epoch": 0.0205, "grad_norm": 1.7734375, "learning_rate": 9.991569577664062e-06, "loss": 1.5292, "step": 410 }, { "epoch": 0.021, "grad_norm": 1.859375, "learning_rate": 9.991106590033422e-06, "loss": 1.5257, "step": 420 }, { "epoch": 0.0215, "grad_norm": 1.703125, "learning_rate": 9.99063123793502e-06, "loss": 1.5143, "step": 430 }, { "epoch": 0.022, "grad_norm": 1.65625, "learning_rate": 9.990143522546443e-06, "loss": 1.5321, "step": 440 }, { "epoch": 0.0225, "grad_norm": 1.734375, "learning_rate": 9.989643445075911e-06, "loss": 1.5312, "step": 450 }, { "epoch": 0.023, "grad_norm": 1.8359375, "learning_rate": 9.989131006762268e-06, "loss": 1.5344, "step": 460 }, { "epoch": 0.0235, "grad_norm": 1.703125, "learning_rate": 9.988606208874972e-06, "loss": 1.5024, "step": 470 }, { "epoch": 0.024, "grad_norm": 1.7734375, "learning_rate": 9.988069052714106e-06, "loss": 1.5253, "step": 480 }, { "epoch": 0.0245, "grad_norm": 1.9453125, "learning_rate": 9.98751953961037e-06, "loss": 1.5123, "step": 490 }, { "epoch": 0.025, "grad_norm": 1.875, "learning_rate": 9.98695767092507e-06, "loss": 1.4938, "step": 500 }, { "epoch": 0.0255, "grad_norm": 1.7109375, "learning_rate": 9.986383448050126e-06, "loss": 1.5022, "step": 510 }, { "epoch": 0.026, "grad_norm": 1.7421875, "learning_rate": 9.985796872408056e-06, "loss": 1.509, "step": 520 }, { "epoch": 0.0265, "grad_norm": 1.6953125, "learning_rate": 9.985197945451988e-06, "loss": 1.5177, "step": 530 }, { "epoch": 0.027, "grad_norm": 1.8671875, "learning_rate": 9.984586668665641e-06, "loss": 1.495, "step": 540 }, { "epoch": 0.0275, "grad_norm": 1.96875, "learning_rate": 9.983963043563331e-06, "loss": 1.5262, "step": 550 }, { "epoch": 0.028, "grad_norm": 1.875, "learning_rate": 9.983327071689967e-06, "loss": 1.5132, "step": 560 }, { "epoch": 0.0285, "grad_norm": 1.7265625, "learning_rate": 9.98267875462104e-06, "loss": 1.5021, "step": 570 }, { "epoch": 0.029, "grad_norm": 1.7734375, "learning_rate": 9.982018093962625e-06, "loss": 1.5059, "step": 580 }, { "epoch": 0.0295, "grad_norm": 2.375, "learning_rate": 9.981345091351377e-06, "loss": 1.4909, "step": 590 }, { "epoch": 0.03, "grad_norm": 1.71875, "learning_rate": 9.980659748454526e-06, "loss": 1.4901, "step": 600 }, { "epoch": 0.0305, "grad_norm": 1.7890625, "learning_rate": 9.979962066969873e-06, "loss": 1.4913, "step": 610 }, { "epoch": 0.031, "grad_norm": 1.9140625, "learning_rate": 9.979252048625783e-06, "loss": 1.4854, "step": 620 }, { "epoch": 0.0315, "grad_norm": 1.78125, "learning_rate": 9.978529695181184e-06, "loss": 1.4923, "step": 630 }, { "epoch": 0.032, "grad_norm": 1.8984375, "learning_rate": 9.977795008425561e-06, "loss": 1.5017, "step": 640 }, { "epoch": 0.0325, "grad_norm": 1.796875, "learning_rate": 9.977047990178957e-06, "loss": 1.4711, "step": 650 }, { "epoch": 0.033, "grad_norm": 1.8046875, "learning_rate": 9.976288642291958e-06, "loss": 1.4979, "step": 660 }, { "epoch": 0.0335, "grad_norm": 1.8828125, "learning_rate": 9.975516966645695e-06, "loss": 1.4777, "step": 670 }, { "epoch": 0.034, "grad_norm": 1.6484375, "learning_rate": 9.974732965151843e-06, "loss": 1.4931, "step": 680 }, { "epoch": 0.0345, "grad_norm": 2.03125, "learning_rate": 9.973936639752606e-06, "loss": 1.4981, "step": 690 }, { "epoch": 0.035, "grad_norm": 1.78125, "learning_rate": 9.973127992420723e-06, "loss": 1.4923, "step": 700 }, { "epoch": 0.0355, "grad_norm": 2.765625, "learning_rate": 9.972307025159454e-06, "loss": 1.493, "step": 710 }, { "epoch": 0.036, "grad_norm": 1.8828125, "learning_rate": 9.971473740002584e-06, "loss": 1.4959, "step": 720 }, { "epoch": 0.0365, "grad_norm": 1.578125, "learning_rate": 9.970628139014406e-06, "loss": 1.4942, "step": 730 }, { "epoch": 0.037, "grad_norm": 1.78125, "learning_rate": 9.969770224289732e-06, "loss": 1.4871, "step": 740 }, { "epoch": 0.0375, "grad_norm": 1.8203125, "learning_rate": 9.96889999795387e-06, "loss": 1.4747, "step": 750 }, { "epoch": 0.038, "grad_norm": 2.0, "learning_rate": 9.968017462162635e-06, "loss": 1.5057, "step": 760 }, { "epoch": 0.0385, "grad_norm": 1.8671875, "learning_rate": 9.967122619102329e-06, "loss": 1.4764, "step": 770 }, { "epoch": 0.039, "grad_norm": 1.984375, "learning_rate": 9.96621547098975e-06, "loss": 1.4936, "step": 780 }, { "epoch": 0.0395, "grad_norm": 1.7109375, "learning_rate": 9.965296020072175e-06, "loss": 1.4844, "step": 790 }, { "epoch": 0.04, "grad_norm": 2.109375, "learning_rate": 9.964364268627359e-06, "loss": 1.4903, "step": 800 }, { "epoch": 0.0405, "grad_norm": 1.84375, "learning_rate": 9.96342021896353e-06, "loss": 1.4683, "step": 810 }, { "epoch": 0.041, "grad_norm": 1.75, "learning_rate": 9.96246387341938e-06, "loss": 1.4628, "step": 820 }, { "epoch": 0.0415, "grad_norm": 1.71875, "learning_rate": 9.961495234364066e-06, "loss": 1.4783, "step": 830 }, { "epoch": 0.042, "grad_norm": 1.859375, "learning_rate": 9.9605143041972e-06, "loss": 1.4854, "step": 840 }, { "epoch": 0.0425, "grad_norm": 1.8828125, "learning_rate": 9.959521085348836e-06, "loss": 1.4898, "step": 850 }, { "epoch": 0.043, "grad_norm": 1.875, "learning_rate": 9.958515580279476e-06, "loss": 1.4646, "step": 860 }, { "epoch": 0.0435, "grad_norm": 2.828125, "learning_rate": 9.957497791480057e-06, "loss": 1.4726, "step": 870 }, { "epoch": 0.044, "grad_norm": 1.7421875, "learning_rate": 9.95646772147195e-06, "loss": 1.4933, "step": 880 }, { "epoch": 0.0445, "grad_norm": 2.171875, "learning_rate": 9.955425372806945e-06, "loss": 1.4766, "step": 890 }, { "epoch": 0.045, "grad_norm": 1.78125, "learning_rate": 9.954370748067253e-06, "loss": 1.4684, "step": 900 }, { "epoch": 0.0455, "grad_norm": 1.78125, "learning_rate": 9.953303849865493e-06, "loss": 1.4611, "step": 910 }, { "epoch": 0.046, "grad_norm": 1.8125, "learning_rate": 9.952224680844695e-06, "loss": 1.4559, "step": 920 }, { "epoch": 0.0465, "grad_norm": 1.7109375, "learning_rate": 9.951133243678284e-06, "loss": 1.4849, "step": 930 }, { "epoch": 0.047, "grad_norm": 1.953125, "learning_rate": 9.950029541070077e-06, "loss": 1.4784, "step": 940 }, { "epoch": 0.0475, "grad_norm": 1.8828125, "learning_rate": 9.948913575754276e-06, "loss": 1.4896, "step": 950 }, { "epoch": 0.048, "grad_norm": 2.03125, "learning_rate": 9.94778535049546e-06, "loss": 1.4708, "step": 960 }, { "epoch": 0.0485, "grad_norm": 1.984375, "learning_rate": 9.946644868088583e-06, "loss": 1.4923, "step": 970 }, { "epoch": 0.049, "grad_norm": 1.984375, "learning_rate": 9.945492131358962e-06, "loss": 1.4838, "step": 980 }, { "epoch": 0.0495, "grad_norm": 1.765625, "learning_rate": 9.944327143162273e-06, "loss": 1.4774, "step": 990 }, { "epoch": 0.05, "grad_norm": 1.8671875, "learning_rate": 9.943149906384538e-06, "loss": 1.4774, "step": 1000 }, { "epoch": 0.0505, "grad_norm": 1.796875, "learning_rate": 9.941960423942128e-06, "loss": 1.4631, "step": 1010 }, { "epoch": 0.051, "grad_norm": 1.75, "learning_rate": 9.940758698781746e-06, "loss": 1.477, "step": 1020 }, { "epoch": 0.0515, "grad_norm": 1.9296875, "learning_rate": 9.939544733880427e-06, "loss": 1.4922, "step": 1030 }, { "epoch": 0.052, "grad_norm": 1.9765625, "learning_rate": 9.938318532245523e-06, "loss": 1.4805, "step": 1040 }, { "epoch": 0.0525, "grad_norm": 1.859375, "learning_rate": 9.937080096914708e-06, "loss": 1.4702, "step": 1050 }, { "epoch": 0.053, "grad_norm": 1.7734375, "learning_rate": 9.935829430955956e-06, "loss": 1.4556, "step": 1060 }, { "epoch": 0.0535, "grad_norm": 1.8671875, "learning_rate": 9.93456653746754e-06, "loss": 1.4735, "step": 1070 }, { "epoch": 0.054, "grad_norm": 1.7890625, "learning_rate": 9.933291419578027e-06, "loss": 1.4712, "step": 1080 }, { "epoch": 0.0545, "grad_norm": 2.765625, "learning_rate": 9.932004080446265e-06, "loss": 1.4656, "step": 1090 }, { "epoch": 0.055, "grad_norm": 1.921875, "learning_rate": 9.93070452326138e-06, "loss": 1.4723, "step": 1100 }, { "epoch": 0.0555, "grad_norm": 1.875, "learning_rate": 9.929392751242767e-06, "loss": 1.4408, "step": 1110 }, { "epoch": 0.056, "grad_norm": 1.8125, "learning_rate": 9.928068767640078e-06, "loss": 1.4501, "step": 1120 }, { "epoch": 0.0565, "grad_norm": 1.8828125, "learning_rate": 9.926732575733214e-06, "loss": 1.4594, "step": 1130 }, { "epoch": 0.057, "grad_norm": 2.046875, "learning_rate": 9.925384178832326e-06, "loss": 1.4635, "step": 1140 }, { "epoch": 0.0575, "grad_norm": 1.875, "learning_rate": 9.924023580277798e-06, "loss": 1.4609, "step": 1150 }, { "epoch": 0.058, "grad_norm": 1.90625, "learning_rate": 9.922650783440237e-06, "loss": 1.4628, "step": 1160 }, { "epoch": 0.0585, "grad_norm": 1.875, "learning_rate": 9.921265791720476e-06, "loss": 1.473, "step": 1170 }, { "epoch": 0.059, "grad_norm": 1.6796875, "learning_rate": 9.919868608549553e-06, "loss": 1.4589, "step": 1180 }, { "epoch": 0.0595, "grad_norm": 1.8828125, "learning_rate": 9.918459237388711e-06, "loss": 1.4357, "step": 1190 }, { "epoch": 0.06, "grad_norm": 1.7890625, "learning_rate": 9.917037681729385e-06, "loss": 1.4593, "step": 1200 }, { "epoch": 0.0605, "grad_norm": 1.8515625, "learning_rate": 9.915603945093195e-06, "loss": 1.4408, "step": 1210 }, { "epoch": 0.061, "grad_norm": 1.8828125, "learning_rate": 9.914158031031934e-06, "loss": 1.4572, "step": 1220 }, { "epoch": 0.0615, "grad_norm": 1.65625, "learning_rate": 9.912699943127566e-06, "loss": 1.4544, "step": 1230 }, { "epoch": 0.062, "grad_norm": 1.828125, "learning_rate": 9.911229684992212e-06, "loss": 1.4503, "step": 1240 }, { "epoch": 0.0625, "grad_norm": 1.7890625, "learning_rate": 9.909747260268145e-06, "loss": 1.4635, "step": 1250 }, { "epoch": 0.063, "grad_norm": 1.8828125, "learning_rate": 9.908252672627772e-06, "loss": 1.4652, "step": 1260 }, { "epoch": 0.0635, "grad_norm": 1.796875, "learning_rate": 9.906745925773636e-06, "loss": 1.4537, "step": 1270 }, { "epoch": 0.064, "grad_norm": 1.8046875, "learning_rate": 9.905227023438401e-06, "loss": 1.46, "step": 1280 }, { "epoch": 0.0645, "grad_norm": 1.8671875, "learning_rate": 9.903695969384843e-06, "loss": 1.4629, "step": 1290 }, { "epoch": 0.065, "grad_norm": 1.890625, "learning_rate": 9.902152767405843e-06, "loss": 1.4554, "step": 1300 }, { "epoch": 0.0655, "grad_norm": 1.828125, "learning_rate": 9.900597421324376e-06, "loss": 1.4419, "step": 1310 }, { "epoch": 0.066, "grad_norm": 1.8671875, "learning_rate": 9.899029934993498e-06, "loss": 1.4363, "step": 1320 }, { "epoch": 0.0665, "grad_norm": 1.8203125, "learning_rate": 9.897450312296344e-06, "loss": 1.4534, "step": 1330 }, { "epoch": 0.067, "grad_norm": 1.8671875, "learning_rate": 9.895858557146112e-06, "loss": 1.4462, "step": 1340 }, { "epoch": 0.0675, "grad_norm": 2.203125, "learning_rate": 9.89425467348606e-06, "loss": 1.4628, "step": 1350 }, { "epoch": 0.068, "grad_norm": 2.09375, "learning_rate": 9.892638665289485e-06, "loss": 1.4509, "step": 1360 }, { "epoch": 0.0685, "grad_norm": 1.625, "learning_rate": 9.891010536559725e-06, "loss": 1.4737, "step": 1370 }, { "epoch": 0.069, "grad_norm": 1.84375, "learning_rate": 9.889370291330145e-06, "loss": 1.4634, "step": 1380 }, { "epoch": 0.0695, "grad_norm": 1.75, "learning_rate": 9.887717933664123e-06, "loss": 1.4451, "step": 1390 }, { "epoch": 0.07, "grad_norm": 1.59375, "learning_rate": 9.886053467655044e-06, "loss": 1.4153, "step": 1400 }, { "epoch": 0.0705, "grad_norm": 1.796875, "learning_rate": 9.88437689742629e-06, "loss": 1.4816, "step": 1410 }, { "epoch": 0.071, "grad_norm": 1.8046875, "learning_rate": 9.882688227131229e-06, "loss": 1.4435, "step": 1420 }, { "epoch": 0.0715, "grad_norm": 1.8984375, "learning_rate": 9.880987460953202e-06, "loss": 1.461, "step": 1430 }, { "epoch": 0.072, "grad_norm": 2.046875, "learning_rate": 9.879274603105518e-06, "loss": 1.4451, "step": 1440 }, { "epoch": 0.0725, "grad_norm": 1.7578125, "learning_rate": 9.877549657831439e-06, "loss": 1.4602, "step": 1450 }, { "epoch": 0.073, "grad_norm": 1.8046875, "learning_rate": 9.875812629404173e-06, "loss": 1.4409, "step": 1460 }, { "epoch": 0.0735, "grad_norm": 1.796875, "learning_rate": 9.874063522126858e-06, "loss": 1.4661, "step": 1470 }, { "epoch": 0.074, "grad_norm": 1.7734375, "learning_rate": 9.872302340332559e-06, "loss": 1.4349, "step": 1480 }, { "epoch": 0.0745, "grad_norm": 2.171875, "learning_rate": 9.870529088384252e-06, "loss": 1.448, "step": 1490 }, { "epoch": 0.075, "grad_norm": 1.78125, "learning_rate": 9.868743770674811e-06, "loss": 1.4292, "step": 1500 }, { "epoch": 0.0755, "grad_norm": 1.859375, "learning_rate": 9.866946391627006e-06, "loss": 1.4519, "step": 1510 }, { "epoch": 0.076, "grad_norm": 4.0, "learning_rate": 9.865136955693485e-06, "loss": 1.455, "step": 1520 }, { "epoch": 0.0765, "grad_norm": 1.7734375, "learning_rate": 9.863315467356758e-06, "loss": 1.4385, "step": 1530 }, { "epoch": 0.077, "grad_norm": 1.875, "learning_rate": 9.861481931129202e-06, "loss": 1.4319, "step": 1540 }, { "epoch": 0.0775, "grad_norm": 1.7265625, "learning_rate": 9.859636351553039e-06, "loss": 1.4291, "step": 1550 }, { "epoch": 0.078, "grad_norm": 1.5078125, "learning_rate": 9.857778733200315e-06, "loss": 1.437, "step": 1560 }, { "epoch": 0.0785, "grad_norm": 1.75, "learning_rate": 9.855909080672915e-06, "loss": 1.4501, "step": 1570 }, { "epoch": 0.079, "grad_norm": 1.8203125, "learning_rate": 9.854027398602526e-06, "loss": 1.4378, "step": 1580 }, { "epoch": 0.0795, "grad_norm": 1.890625, "learning_rate": 9.852133691650639e-06, "loss": 1.4439, "step": 1590 }, { "epoch": 0.08, "grad_norm": 2.09375, "learning_rate": 9.850227964508533e-06, "loss": 1.461, "step": 1600 }, { "epoch": 0.0805, "grad_norm": 1.8984375, "learning_rate": 9.848310221897268e-06, "loss": 1.4455, "step": 1610 }, { "epoch": 0.081, "grad_norm": 1.6875, "learning_rate": 9.846380468567664e-06, "loss": 1.4318, "step": 1620 }, { "epoch": 0.0815, "grad_norm": 2.140625, "learning_rate": 9.844438709300302e-06, "loss": 1.4279, "step": 1630 }, { "epoch": 0.082, "grad_norm": 2.140625, "learning_rate": 9.842484948905502e-06, "loss": 1.4355, "step": 1640 }, { "epoch": 0.0825, "grad_norm": 1.8515625, "learning_rate": 9.840519192223313e-06, "loss": 1.4413, "step": 1650 }, { "epoch": 0.083, "grad_norm": 1.7578125, "learning_rate": 9.838541444123502e-06, "loss": 1.4405, "step": 1660 }, { "epoch": 0.0835, "grad_norm": 2.015625, "learning_rate": 9.836551709505548e-06, "loss": 1.447, "step": 1670 }, { "epoch": 0.084, "grad_norm": 1.8359375, "learning_rate": 9.834549993298618e-06, "loss": 1.439, "step": 1680 }, { "epoch": 0.0845, "grad_norm": 2.0, "learning_rate": 9.832536300461563e-06, "loss": 1.4262, "step": 1690 }, { "epoch": 0.085, "grad_norm": 1.78125, "learning_rate": 9.830510635982905e-06, "loss": 1.4292, "step": 1700 }, { "epoch": 0.0855, "grad_norm": 1.875, "learning_rate": 9.828473004880824e-06, "loss": 1.4553, "step": 1710 }, { "epoch": 0.086, "grad_norm": 1.7734375, "learning_rate": 9.826423412203142e-06, "loss": 1.4396, "step": 1720 }, { "epoch": 0.0865, "grad_norm": 1.7421875, "learning_rate": 9.824361863027311e-06, "loss": 1.4476, "step": 1730 }, { "epoch": 0.087, "grad_norm": 1.8984375, "learning_rate": 9.822288362460412e-06, "loss": 1.4227, "step": 1740 }, { "epoch": 0.0875, "grad_norm": 1.984375, "learning_rate": 9.820202915639125e-06, "loss": 1.4346, "step": 1750 }, { "epoch": 0.088, "grad_norm": 1.7734375, "learning_rate": 9.818105527729726e-06, "loss": 1.4311, "step": 1760 }, { "epoch": 0.0885, "grad_norm": 1.890625, "learning_rate": 9.815996203928077e-06, "loss": 1.4479, "step": 1770 }, { "epoch": 0.089, "grad_norm": 1.828125, "learning_rate": 9.813874949459605e-06, "loss": 1.4597, "step": 1780 }, { "epoch": 0.0895, "grad_norm": 1.8046875, "learning_rate": 9.811741769579292e-06, "loss": 1.4456, "step": 1790 }, { "epoch": 0.09, "grad_norm": 1.9375, "learning_rate": 9.809596669571663e-06, "loss": 1.4433, "step": 1800 }, { "epoch": 0.0905, "grad_norm": 1.7265625, "learning_rate": 9.807439654750778e-06, "loss": 1.4297, "step": 1810 }, { "epoch": 0.091, "grad_norm": 1.65625, "learning_rate": 9.805270730460206e-06, "loss": 1.4397, "step": 1820 }, { "epoch": 0.0915, "grad_norm": 2.03125, "learning_rate": 9.803089902073024e-06, "loss": 1.422, "step": 1830 }, { "epoch": 0.092, "grad_norm": 1.921875, "learning_rate": 9.800897174991799e-06, "loss": 1.4306, "step": 1840 }, { "epoch": 0.0925, "grad_norm": 1.8984375, "learning_rate": 9.798692554648573e-06, "loss": 1.4306, "step": 1850 }, { "epoch": 0.093, "grad_norm": 2.25, "learning_rate": 9.79647604650485e-06, "loss": 1.4307, "step": 1860 }, { "epoch": 0.0935, "grad_norm": 1.796875, "learning_rate": 9.794247656051587e-06, "loss": 1.4489, "step": 1870 }, { "epoch": 0.094, "grad_norm": 1.9375, "learning_rate": 9.792007388809175e-06, "loss": 1.4194, "step": 1880 }, { "epoch": 0.0945, "grad_norm": 2.015625, "learning_rate": 9.789755250327429e-06, "loss": 1.4413, "step": 1890 }, { "epoch": 0.095, "grad_norm": 1.84375, "learning_rate": 9.787491246185569e-06, "loss": 1.4346, "step": 1900 }, { "epoch": 0.0955, "grad_norm": 1.90625, "learning_rate": 9.785215381992213e-06, "loss": 1.4467, "step": 1910 }, { "epoch": 0.096, "grad_norm": 1.8203125, "learning_rate": 9.78292766338536e-06, "loss": 1.4246, "step": 1920 }, { "epoch": 0.0965, "grad_norm": 1.84375, "learning_rate": 9.780628096032376e-06, "loss": 1.4231, "step": 1930 }, { "epoch": 0.097, "grad_norm": 1.703125, "learning_rate": 9.778316685629977e-06, "loss": 1.4223, "step": 1940 }, { "epoch": 0.0975, "grad_norm": 1.8828125, "learning_rate": 9.77599343790422e-06, "loss": 1.4363, "step": 1950 }, { "epoch": 0.098, "grad_norm": 1.8203125, "learning_rate": 9.773658358610485e-06, "loss": 1.4396, "step": 1960 }, { "epoch": 0.0985, "grad_norm": 2.015625, "learning_rate": 9.771311453533467e-06, "loss": 1.4216, "step": 1970 }, { "epoch": 0.099, "grad_norm": 1.921875, "learning_rate": 9.76895272848715e-06, "loss": 1.4367, "step": 1980 }, { "epoch": 0.0995, "grad_norm": 1.90625, "learning_rate": 9.766582189314808e-06, "loss": 1.4302, "step": 1990 }, { "epoch": 0.1, "grad_norm": 1.6796875, "learning_rate": 9.764199841888973e-06, "loss": 1.4239, "step": 2000 }, { "epoch": 0.1005, "grad_norm": 1.59375, "learning_rate": 9.761805692111437e-06, "loss": 1.4341, "step": 2010 }, { "epoch": 0.101, "grad_norm": 1.8984375, "learning_rate": 9.759399745913225e-06, "loss": 1.4312, "step": 2020 }, { "epoch": 0.1015, "grad_norm": 2.734375, "learning_rate": 9.756982009254586e-06, "loss": 1.4409, "step": 2030 }, { "epoch": 0.102, "grad_norm": 1.96875, "learning_rate": 9.754552488124982e-06, "loss": 1.4355, "step": 2040 }, { "epoch": 0.1025, "grad_norm": 1.7890625, "learning_rate": 9.752111188543064e-06, "loss": 1.4157, "step": 2050 }, { "epoch": 0.103, "grad_norm": 1.9453125, "learning_rate": 9.749658116556663e-06, "loss": 1.4389, "step": 2060 }, { "epoch": 0.1035, "grad_norm": 1.8828125, "learning_rate": 9.747193278242772e-06, "loss": 1.4249, "step": 2070 }, { "epoch": 0.104, "grad_norm": 1.8515625, "learning_rate": 9.744716679707537e-06, "loss": 1.4353, "step": 2080 }, { "epoch": 0.1045, "grad_norm": 1.671875, "learning_rate": 9.742228327086238e-06, "loss": 1.4232, "step": 2090 }, { "epoch": 0.105, "grad_norm": 1.71875, "learning_rate": 9.739728226543264e-06, "loss": 1.4376, "step": 2100 }, { "epoch": 0.1055, "grad_norm": 2.015625, "learning_rate": 9.737216384272119e-06, "loss": 1.4127, "step": 2110 }, { "epoch": 0.106, "grad_norm": 1.9609375, "learning_rate": 9.734692806495389e-06, "loss": 1.4154, "step": 2120 }, { "epoch": 0.1065, "grad_norm": 1.703125, "learning_rate": 9.732157499464731e-06, "loss": 1.432, "step": 2130 }, { "epoch": 0.107, "grad_norm": 1.9140625, "learning_rate": 9.72961046946086e-06, "loss": 1.4418, "step": 2140 }, { "epoch": 0.1075, "grad_norm": 1.84375, "learning_rate": 9.727051722793537e-06, "loss": 1.4259, "step": 2150 }, { "epoch": 0.108, "grad_norm": 2.09375, "learning_rate": 9.724481265801542e-06, "loss": 1.4203, "step": 2160 }, { "epoch": 0.1085, "grad_norm": 1.8359375, "learning_rate": 9.721899104852668e-06, "loss": 1.4322, "step": 2170 }, { "epoch": 0.109, "grad_norm": 1.8984375, "learning_rate": 9.719305246343702e-06, "loss": 1.4118, "step": 2180 }, { "epoch": 0.1095, "grad_norm": 1.9296875, "learning_rate": 9.716699696700412e-06, "loss": 1.4385, "step": 2190 }, { "epoch": 0.11, "grad_norm": 2.125, "learning_rate": 9.714082462377522e-06, "loss": 1.4076, "step": 2200 }, { "epoch": 0.1105, "grad_norm": 1.859375, "learning_rate": 9.711453549858707e-06, "loss": 1.4301, "step": 2210 }, { "epoch": 0.111, "grad_norm": 1.84375, "learning_rate": 9.708812965656573e-06, "loss": 1.4192, "step": 2220 }, { "epoch": 0.1115, "grad_norm": 1.9296875, "learning_rate": 9.706160716312639e-06, "loss": 1.4412, "step": 2230 }, { "epoch": 0.112, "grad_norm": 1.96875, "learning_rate": 9.703496808397322e-06, "loss": 1.4, "step": 2240 }, { "epoch": 0.1125, "grad_norm": 1.96875, "learning_rate": 9.70082124850992e-06, "loss": 1.4235, "step": 2250 }, { "epoch": 0.113, "grad_norm": 1.9140625, "learning_rate": 9.698134043278595e-06, "loss": 1.4272, "step": 2260 }, { "epoch": 0.1135, "grad_norm": 1.8984375, "learning_rate": 9.695435199360366e-06, "loss": 1.402, "step": 2270 }, { "epoch": 0.114, "grad_norm": 1.6328125, "learning_rate": 9.692724723441074e-06, "loss": 1.4151, "step": 2280 }, { "epoch": 0.1145, "grad_norm": 1.8984375, "learning_rate": 9.690002622235381e-06, "loss": 1.4108, "step": 2290 }, { "epoch": 0.115, "grad_norm": 1.921875, "learning_rate": 9.687268902486751e-06, "loss": 1.4239, "step": 2300 }, { "epoch": 0.1155, "grad_norm": 1.953125, "learning_rate": 9.684523570967423e-06, "loss": 1.43, "step": 2310 }, { "epoch": 0.116, "grad_norm": 2.0625, "learning_rate": 9.681766634478411e-06, "loss": 1.4205, "step": 2320 }, { "epoch": 0.1165, "grad_norm": 2.03125, "learning_rate": 9.67899809984947e-06, "loss": 1.4284, "step": 2330 }, { "epoch": 0.117, "grad_norm": 1.9296875, "learning_rate": 9.676217973939095e-06, "loss": 1.4414, "step": 2340 }, { "epoch": 0.1175, "grad_norm": 1.8046875, "learning_rate": 9.673426263634486e-06, "loss": 1.4231, "step": 2350 }, { "epoch": 0.118, "grad_norm": 2.0, "learning_rate": 9.67062297585155e-06, "loss": 1.4218, "step": 2360 }, { "epoch": 0.1185, "grad_norm": 1.9375, "learning_rate": 9.66780811753487e-06, "loss": 1.4088, "step": 2370 }, { "epoch": 0.119, "grad_norm": 1.8515625, "learning_rate": 9.664981695657697e-06, "loss": 1.4358, "step": 2380 }, { "epoch": 0.1195, "grad_norm": 1.78125, "learning_rate": 9.66214371722192e-06, "loss": 1.4208, "step": 2390 }, { "epoch": 0.12, "grad_norm": 1.8046875, "learning_rate": 9.659294189258068e-06, "loss": 1.4066, "step": 2400 }, { "epoch": 0.1205, "grad_norm": 2.15625, "learning_rate": 9.656433118825275e-06, "loss": 1.4248, "step": 2410 }, { "epoch": 0.121, "grad_norm": 1.6953125, "learning_rate": 9.653560513011269e-06, "loss": 1.4109, "step": 2420 }, { "epoch": 0.1215, "grad_norm": 1.8125, "learning_rate": 9.650676378932356e-06, "loss": 1.418, "step": 2430 }, { "epoch": 0.122, "grad_norm": 2.0, "learning_rate": 9.647780723733403e-06, "loss": 1.4109, "step": 2440 }, { "epoch": 0.1225, "grad_norm": 2.046875, "learning_rate": 9.644873554587815e-06, "loss": 1.4271, "step": 2450 }, { "epoch": 0.123, "grad_norm": 1.921875, "learning_rate": 9.641954878697522e-06, "loss": 1.4172, "step": 2460 }, { "epoch": 0.1235, "grad_norm": 2.03125, "learning_rate": 9.63902470329296e-06, "loss": 1.437, "step": 2470 }, { "epoch": 0.124, "grad_norm": 1.78125, "learning_rate": 9.63608303563305e-06, "loss": 1.4102, "step": 2480 }, { "epoch": 0.1245, "grad_norm": 1.9765625, "learning_rate": 9.633129883005188e-06, "loss": 1.4116, "step": 2490 }, { "epoch": 0.125, "grad_norm": 1.9296875, "learning_rate": 9.630165252725216e-06, "loss": 1.4378, "step": 2500 }, { "epoch": 0.1255, "grad_norm": 2.03125, "learning_rate": 9.627189152137412e-06, "loss": 1.4217, "step": 2510 }, { "epoch": 0.126, "grad_norm": 2.015625, "learning_rate": 9.624201588614475e-06, "loss": 1.4103, "step": 2520 }, { "epoch": 0.1265, "grad_norm": 1.8515625, "learning_rate": 9.621202569557489e-06, "loss": 1.4236, "step": 2530 }, { "epoch": 0.127, "grad_norm": 1.953125, "learning_rate": 9.618192102395926e-06, "loss": 1.4246, "step": 2540 }, { "epoch": 0.1275, "grad_norm": 1.890625, "learning_rate": 9.615170194587617e-06, "loss": 1.4029, "step": 2550 }, { "epoch": 0.128, "grad_norm": 1.9140625, "learning_rate": 9.612136853618734e-06, "loss": 1.4113, "step": 2560 }, { "epoch": 0.1285, "grad_norm": 2.234375, "learning_rate": 9.60909208700377e-06, "loss": 1.4378, "step": 2570 }, { "epoch": 0.129, "grad_norm": 1.8125, "learning_rate": 9.606035902285528e-06, "loss": 1.4261, "step": 2580 }, { "epoch": 0.1295, "grad_norm": 1.890625, "learning_rate": 9.60296830703509e-06, "loss": 1.4187, "step": 2590 }, { "epoch": 0.13, "grad_norm": 2.0, "learning_rate": 9.599889308851814e-06, "loss": 1.4126, "step": 2600 }, { "epoch": 0.1305, "grad_norm": 2.09375, "learning_rate": 9.596798915363299e-06, "loss": 1.4195, "step": 2610 }, { "epoch": 0.131, "grad_norm": 2.125, "learning_rate": 9.593697134225375e-06, "loss": 1.4385, "step": 2620 }, { "epoch": 0.1315, "grad_norm": 2.3125, "learning_rate": 9.590583973122088e-06, "loss": 1.4255, "step": 2630 }, { "epoch": 0.132, "grad_norm": 1.9375, "learning_rate": 9.58745943976567e-06, "loss": 1.4182, "step": 2640 }, { "epoch": 0.1325, "grad_norm": 1.9921875, "learning_rate": 9.584323541896525e-06, "loss": 1.4128, "step": 2650 }, { "epoch": 0.133, "grad_norm": 1.8046875, "learning_rate": 9.581176287283216e-06, "loss": 1.4099, "step": 2660 }, { "epoch": 0.1335, "grad_norm": 1.8046875, "learning_rate": 9.578017683722434e-06, "loss": 1.4107, "step": 2670 }, { "epoch": 0.134, "grad_norm": 1.9375, "learning_rate": 9.574847739038986e-06, "loss": 1.4147, "step": 2680 }, { "epoch": 0.1345, "grad_norm": 1.796875, "learning_rate": 9.57166646108578e-06, "loss": 1.412, "step": 2690 }, { "epoch": 0.135, "grad_norm": 2.046875, "learning_rate": 9.568473857743792e-06, "loss": 1.4355, "step": 2700 }, { "epoch": 0.1355, "grad_norm": 1.796875, "learning_rate": 9.565269936922059e-06, "loss": 1.3911, "step": 2710 }, { "epoch": 0.136, "grad_norm": 1.8125, "learning_rate": 9.562054706557653e-06, "loss": 1.4233, "step": 2720 }, { "epoch": 0.1365, "grad_norm": 1.859375, "learning_rate": 9.558828174615665e-06, "loss": 1.4246, "step": 2730 }, { "epoch": 0.137, "grad_norm": 1.8046875, "learning_rate": 9.55559034908918e-06, "loss": 1.4048, "step": 2740 }, { "epoch": 0.1375, "grad_norm": 1.9453125, "learning_rate": 9.552341237999267e-06, "loss": 1.4227, "step": 2750 }, { "epoch": 0.138, "grad_norm": 1.9375, "learning_rate": 9.549080849394944e-06, "loss": 1.398, "step": 2760 }, { "epoch": 0.1385, "grad_norm": 1.8828125, "learning_rate": 9.545809191353173e-06, "loss": 1.4165, "step": 2770 }, { "epoch": 0.139, "grad_norm": 1.96875, "learning_rate": 9.542526271978832e-06, "loss": 1.4254, "step": 2780 }, { "epoch": 0.1395, "grad_norm": 2.203125, "learning_rate": 9.539232099404698e-06, "loss": 1.4186, "step": 2790 }, { "epoch": 0.14, "grad_norm": 1.90625, "learning_rate": 9.535926681791423e-06, "loss": 1.4096, "step": 2800 }, { "epoch": 0.1405, "grad_norm": 1.8125, "learning_rate": 9.532610027327517e-06, "loss": 1.4103, "step": 2810 }, { "epoch": 0.141, "grad_norm": 1.875, "learning_rate": 9.52928214422933e-06, "loss": 1.4008, "step": 2820 }, { "epoch": 0.1415, "grad_norm": 1.8125, "learning_rate": 9.525943040741025e-06, "loss": 1.4017, "step": 2830 }, { "epoch": 0.142, "grad_norm": 2.53125, "learning_rate": 9.522592725134563e-06, "loss": 1.4084, "step": 2840 }, { "epoch": 0.1425, "grad_norm": 1.828125, "learning_rate": 9.51923120570968e-06, "loss": 1.4051, "step": 2850 }, { "epoch": 0.143, "grad_norm": 2.03125, "learning_rate": 9.515858490793866e-06, "loss": 1.4235, "step": 2860 }, { "epoch": 0.1435, "grad_norm": 1.8359375, "learning_rate": 9.51247458874235e-06, "loss": 1.4221, "step": 2870 }, { "epoch": 0.144, "grad_norm": 2.015625, "learning_rate": 9.509079507938073e-06, "loss": 1.4097, "step": 2880 }, { "epoch": 0.1445, "grad_norm": 3.203125, "learning_rate": 9.505673256791666e-06, "loss": 1.4237, "step": 2890 }, { "epoch": 0.145, "grad_norm": 1.796875, "learning_rate": 9.502255843741434e-06, "loss": 1.4163, "step": 2900 }, { "epoch": 0.1455, "grad_norm": 1.8125, "learning_rate": 9.498827277253335e-06, "loss": 1.4011, "step": 2910 }, { "epoch": 0.146, "grad_norm": 1.8515625, "learning_rate": 9.495387565820956e-06, "loss": 1.405, "step": 2920 }, { "epoch": 0.1465, "grad_norm": 1.765625, "learning_rate": 9.491936717965497e-06, "loss": 1.4066, "step": 2930 }, { "epoch": 0.147, "grad_norm": 2.046875, "learning_rate": 9.488474742235739e-06, "loss": 1.4332, "step": 2940 }, { "epoch": 0.1475, "grad_norm": 1.8125, "learning_rate": 9.485001647208033e-06, "loss": 1.3953, "step": 2950 }, { "epoch": 0.148, "grad_norm": 1.8671875, "learning_rate": 9.481517441486281e-06, "loss": 1.4037, "step": 2960 }, { "epoch": 0.1485, "grad_norm": 1.8828125, "learning_rate": 9.478022133701903e-06, "loss": 1.4288, "step": 2970 }, { "epoch": 0.149, "grad_norm": 3.8125, "learning_rate": 9.474515732513825e-06, "loss": 1.3995, "step": 2980 }, { "epoch": 0.1495, "grad_norm": 2.046875, "learning_rate": 9.470998246608454e-06, "loss": 1.414, "step": 2990 }, { "epoch": 0.15, "grad_norm": 1.7890625, "learning_rate": 9.467469684699658e-06, "loss": 1.4221, "step": 3000 }, { "epoch": 0.1505, "grad_norm": 1.890625, "learning_rate": 9.463930055528742e-06, "loss": 1.4131, "step": 3010 }, { "epoch": 0.151, "grad_norm": 1.796875, "learning_rate": 9.460379367864431e-06, "loss": 1.4179, "step": 3020 }, { "epoch": 0.1515, "grad_norm": 1.703125, "learning_rate": 9.45681763050284e-06, "loss": 1.4107, "step": 3030 }, { "epoch": 0.152, "grad_norm": 2.109375, "learning_rate": 9.453244852267465e-06, "loss": 1.4013, "step": 3040 }, { "epoch": 0.1525, "grad_norm": 2.15625, "learning_rate": 9.449661042009147e-06, "loss": 1.4168, "step": 3050 }, { "epoch": 0.153, "grad_norm": 1.7890625, "learning_rate": 9.446066208606056e-06, "loss": 1.4177, "step": 3060 }, { "epoch": 0.1535, "grad_norm": 1.7578125, "learning_rate": 9.442460360963674e-06, "loss": 1.3992, "step": 3070 }, { "epoch": 0.154, "grad_norm": 1.9609375, "learning_rate": 9.438843508014767e-06, "loss": 1.4302, "step": 3080 }, { "epoch": 0.1545, "grad_norm": 2.34375, "learning_rate": 9.435215658719366e-06, "loss": 1.4239, "step": 3090 }, { "epoch": 0.155, "grad_norm": 1.9453125, "learning_rate": 9.431576822064741e-06, "loss": 1.4246, "step": 3100 }, { "epoch": 0.1555, "grad_norm": 1.9453125, "learning_rate": 9.42792700706538e-06, "loss": 1.4102, "step": 3110 }, { "epoch": 0.156, "grad_norm": 1.953125, "learning_rate": 9.424266222762971e-06, "loss": 1.4214, "step": 3120 }, { "epoch": 0.1565, "grad_norm": 2.03125, "learning_rate": 9.420594478226374e-06, "loss": 1.4212, "step": 3130 }, { "epoch": 0.157, "grad_norm": 1.90625, "learning_rate": 9.416911782551602e-06, "loss": 1.4309, "step": 3140 }, { "epoch": 0.1575, "grad_norm": 1.890625, "learning_rate": 9.413218144861793e-06, "loss": 1.4032, "step": 3150 }, { "epoch": 0.158, "grad_norm": 1.8828125, "learning_rate": 9.409513574307199e-06, "loss": 1.4209, "step": 3160 }, { "epoch": 0.1585, "grad_norm": 3.1875, "learning_rate": 9.40579808006515e-06, "loss": 1.4033, "step": 3170 }, { "epoch": 0.159, "grad_norm": 1.8515625, "learning_rate": 9.402071671340045e-06, "loss": 1.4046, "step": 3180 }, { "epoch": 0.1595, "grad_norm": 1.9453125, "learning_rate": 9.398334357363308e-06, "loss": 1.4064, "step": 3190 }, { "epoch": 0.16, "grad_norm": 1.984375, "learning_rate": 9.39458614739339e-06, "loss": 1.43, "step": 3200 }, { "epoch": 0.1605, "grad_norm": 1.9375, "learning_rate": 9.390827050715732e-06, "loss": 1.4085, "step": 3210 }, { "epoch": 0.161, "grad_norm": 1.8671875, "learning_rate": 9.387057076642742e-06, "loss": 1.4242, "step": 3220 }, { "epoch": 0.1615, "grad_norm": 1.9296875, "learning_rate": 9.383276234513778e-06, "loss": 1.4033, "step": 3230 }, { "epoch": 0.162, "grad_norm": 1.7578125, "learning_rate": 9.379484533695118e-06, "loss": 1.4016, "step": 3240 }, { "epoch": 0.1625, "grad_norm": 1.953125, "learning_rate": 9.375681983579943e-06, "loss": 1.3968, "step": 3250 }, { "epoch": 0.163, "grad_norm": 1.8046875, "learning_rate": 9.371868593588311e-06, "loss": 1.4029, "step": 3260 }, { "epoch": 0.1635, "grad_norm": 1.9140625, "learning_rate": 9.368044373167129e-06, "loss": 1.4212, "step": 3270 }, { "epoch": 0.164, "grad_norm": 2.046875, "learning_rate": 9.364209331790141e-06, "loss": 1.4062, "step": 3280 }, { "epoch": 0.1645, "grad_norm": 2.15625, "learning_rate": 9.360363478957895e-06, "loss": 1.4177, "step": 3290 }, { "epoch": 0.165, "grad_norm": 2.015625, "learning_rate": 9.356506824197719e-06, "loss": 1.4034, "step": 3300 }, { "epoch": 0.1655, "grad_norm": 1.8828125, "learning_rate": 9.352639377063707e-06, "loss": 1.4244, "step": 3310 }, { "epoch": 0.166, "grad_norm": 1.921875, "learning_rate": 9.348761147136681e-06, "loss": 1.4016, "step": 3320 }, { "epoch": 0.1665, "grad_norm": 1.8046875, "learning_rate": 9.344872144024183e-06, "loss": 1.4078, "step": 3330 }, { "epoch": 0.167, "grad_norm": 1.765625, "learning_rate": 9.340972377360438e-06, "loss": 1.407, "step": 3340 }, { "epoch": 0.1675, "grad_norm": 2.015625, "learning_rate": 9.33706185680634e-06, "loss": 1.3963, "step": 3350 }, { "epoch": 0.168, "grad_norm": 1.96875, "learning_rate": 9.333140592049416e-06, "loss": 1.4062, "step": 3360 }, { "epoch": 0.1685, "grad_norm": 2.046875, "learning_rate": 9.32920859280382e-06, "loss": 1.3984, "step": 3370 }, { "epoch": 0.169, "grad_norm": 2.09375, "learning_rate": 9.325265868810291e-06, "loss": 1.412, "step": 3380 }, { "epoch": 0.1695, "grad_norm": 2.03125, "learning_rate": 9.321312429836139e-06, "loss": 1.3934, "step": 3390 }, { "epoch": 0.17, "grad_norm": 1.8671875, "learning_rate": 9.317348285675217e-06, "loss": 1.4004, "step": 3400 }, { "epoch": 0.1705, "grad_norm": 1.9453125, "learning_rate": 9.3133734461479e-06, "loss": 1.4276, "step": 3410 }, { "epoch": 0.171, "grad_norm": 1.6796875, "learning_rate": 9.309387921101058e-06, "loss": 1.3908, "step": 3420 }, { "epoch": 0.1715, "grad_norm": 1.765625, "learning_rate": 9.30539172040803e-06, "loss": 1.3957, "step": 3430 }, { "epoch": 0.172, "grad_norm": 2.046875, "learning_rate": 9.301384853968605e-06, "loss": 1.4109, "step": 3440 }, { "epoch": 0.1725, "grad_norm": 3.265625, "learning_rate": 9.297367331708993e-06, "loss": 1.4134, "step": 3450 }, { "epoch": 0.173, "grad_norm": 1.7734375, "learning_rate": 9.2933391635818e-06, "loss": 1.4043, "step": 3460 }, { "epoch": 0.1735, "grad_norm": 2.09375, "learning_rate": 9.289300359566005e-06, "loss": 1.4012, "step": 3470 }, { "epoch": 0.174, "grad_norm": 1.96875, "learning_rate": 9.285250929666941e-06, "loss": 1.4077, "step": 3480 }, { "epoch": 0.1745, "grad_norm": 1.9609375, "learning_rate": 9.281190883916258e-06, "loss": 1.4044, "step": 3490 }, { "epoch": 0.175, "grad_norm": 1.921875, "learning_rate": 9.277120232371906e-06, "loss": 1.3848, "step": 3500 }, { "epoch": 0.1755, "grad_norm": 2.125, "learning_rate": 9.27303898511811e-06, "loss": 1.4161, "step": 3510 }, { "epoch": 0.176, "grad_norm": 1.96875, "learning_rate": 9.268947152265343e-06, "loss": 1.4059, "step": 3520 }, { "epoch": 0.1765, "grad_norm": 1.8359375, "learning_rate": 9.264844743950305e-06, "loss": 1.4083, "step": 3530 }, { "epoch": 0.177, "grad_norm": 1.65625, "learning_rate": 9.260731770335888e-06, "loss": 1.386, "step": 3540 }, { "epoch": 0.1775, "grad_norm": 1.609375, "learning_rate": 9.256608241611163e-06, "loss": 1.4009, "step": 3550 }, { "epoch": 0.178, "grad_norm": 1.703125, "learning_rate": 9.252474167991347e-06, "loss": 1.404, "step": 3560 }, { "epoch": 0.1785, "grad_norm": 2.140625, "learning_rate": 9.24832955971778e-06, "loss": 1.3995, "step": 3570 }, { "epoch": 0.179, "grad_norm": 1.84375, "learning_rate": 9.244174427057897e-06, "loss": 1.4114, "step": 3580 }, { "epoch": 0.1795, "grad_norm": 1.8671875, "learning_rate": 9.240008780305211e-06, "loss": 1.399, "step": 3590 }, { "epoch": 0.18, "grad_norm": 1.8828125, "learning_rate": 9.235832629779277e-06, "loss": 1.408, "step": 3600 }, { "epoch": 0.1805, "grad_norm": 1.8984375, "learning_rate": 9.231645985825673e-06, "loss": 1.3894, "step": 3610 }, { "epoch": 0.181, "grad_norm": 1.90625, "learning_rate": 9.227448858815972e-06, "loss": 1.3818, "step": 3620 }, { "epoch": 0.1815, "grad_norm": 1.8984375, "learning_rate": 9.223241259147713e-06, "loss": 1.4222, "step": 3630 }, { "epoch": 0.182, "grad_norm": 1.8828125, "learning_rate": 9.219023197244385e-06, "loss": 1.3944, "step": 3640 }, { "epoch": 0.1825, "grad_norm": 1.9609375, "learning_rate": 9.214794683555392e-06, "loss": 1.3937, "step": 3650 }, { "epoch": 0.183, "grad_norm": 1.890625, "learning_rate": 9.21055572855603e-06, "loss": 1.3855, "step": 3660 }, { "epoch": 0.1835, "grad_norm": 1.984375, "learning_rate": 9.206306342747462e-06, "loss": 1.4042, "step": 3670 }, { "epoch": 0.184, "grad_norm": 1.7421875, "learning_rate": 9.20204653665669e-06, "loss": 1.4032, "step": 3680 }, { "epoch": 0.1845, "grad_norm": 2.046875, "learning_rate": 9.197776320836534e-06, "loss": 1.3966, "step": 3690 }, { "epoch": 0.185, "grad_norm": 2.0, "learning_rate": 9.193495705865593e-06, "loss": 1.4075, "step": 3700 }, { "epoch": 0.1855, "grad_norm": 1.8515625, "learning_rate": 9.189204702348242e-06, "loss": 1.4043, "step": 3710 }, { "epoch": 0.186, "grad_norm": 1.875, "learning_rate": 9.184903320914577e-06, "loss": 1.4014, "step": 3720 }, { "epoch": 0.1865, "grad_norm": 1.96875, "learning_rate": 9.180591572220413e-06, "loss": 1.407, "step": 3730 }, { "epoch": 0.187, "grad_norm": 2.828125, "learning_rate": 9.176269466947243e-06, "loss": 1.3924, "step": 3740 }, { "epoch": 0.1875, "grad_norm": 2.015625, "learning_rate": 9.171937015802218e-06, "loss": 1.4228, "step": 3750 }, { "epoch": 0.188, "grad_norm": 1.8203125, "learning_rate": 9.167594229518123e-06, "loss": 1.3802, "step": 3760 }, { "epoch": 0.1885, "grad_norm": 1.9609375, "learning_rate": 9.163241118853335e-06, "loss": 1.3947, "step": 3770 }, { "epoch": 0.189, "grad_norm": 1.8984375, "learning_rate": 9.158877694591818e-06, "loss": 1.4117, "step": 3780 }, { "epoch": 0.1895, "grad_norm": 1.90625, "learning_rate": 9.154503967543083e-06, "loss": 1.4189, "step": 3790 }, { "epoch": 0.19, "grad_norm": 1.7890625, "learning_rate": 9.150119948542163e-06, "loss": 1.3978, "step": 3800 }, { "epoch": 0.1905, "grad_norm": 1.84375, "learning_rate": 9.145725648449586e-06, "loss": 1.4094, "step": 3810 }, { "epoch": 0.191, "grad_norm": 2.078125, "learning_rate": 9.141321078151353e-06, "loss": 1.4086, "step": 3820 }, { "epoch": 0.1915, "grad_norm": 1.9765625, "learning_rate": 9.136906248558905e-06, "loss": 1.3956, "step": 3830 }, { "epoch": 0.192, "grad_norm": 2.921875, "learning_rate": 9.1324811706091e-06, "loss": 1.4155, "step": 3840 }, { "epoch": 0.1925, "grad_norm": 1.8984375, "learning_rate": 9.128045855264179e-06, "loss": 1.3995, "step": 3850 }, { "epoch": 0.193, "grad_norm": 1.84375, "learning_rate": 9.123600313511753e-06, "loss": 1.4133, "step": 3860 }, { "epoch": 0.1935, "grad_norm": 1.9296875, "learning_rate": 9.119144556364761e-06, "loss": 1.3983, "step": 3870 }, { "epoch": 0.194, "grad_norm": 1.984375, "learning_rate": 9.114678594861446e-06, "loss": 1.4127, "step": 3880 }, { "epoch": 0.1945, "grad_norm": 1.7109375, "learning_rate": 9.11020244006534e-06, "loss": 1.3767, "step": 3890 }, { "epoch": 0.195, "grad_norm": 1.796875, "learning_rate": 9.105716103065216e-06, "loss": 1.3987, "step": 3900 }, { "epoch": 0.1955, "grad_norm": 1.9609375, "learning_rate": 9.101219594975077e-06, "loss": 1.3911, "step": 3910 }, { "epoch": 0.196, "grad_norm": 2.03125, "learning_rate": 9.096712926934124e-06, "loss": 1.4177, "step": 3920 }, { "epoch": 0.1965, "grad_norm": 1.7578125, "learning_rate": 9.09219611010672e-06, "loss": 1.404, "step": 3930 }, { "epoch": 0.197, "grad_norm": 1.8671875, "learning_rate": 9.087669155682382e-06, "loss": 1.3977, "step": 3940 }, { "epoch": 0.1975, "grad_norm": 1.8515625, "learning_rate": 9.08313207487573e-06, "loss": 1.3895, "step": 3950 }, { "epoch": 0.198, "grad_norm": 2.234375, "learning_rate": 9.078584878926473e-06, "loss": 1.4126, "step": 3960 }, { "epoch": 0.1985, "grad_norm": 1.7734375, "learning_rate": 9.07402757909938e-06, "loss": 1.4153, "step": 3970 }, { "epoch": 0.199, "grad_norm": 2.609375, "learning_rate": 9.06946018668425e-06, "loss": 1.3959, "step": 3980 }, { "epoch": 0.1995, "grad_norm": 1.8515625, "learning_rate": 9.064882712995881e-06, "loss": 1.3992, "step": 3990 }, { "epoch": 0.2, "grad_norm": 1.921875, "learning_rate": 9.060295169374051e-06, "loss": 1.3994, "step": 4000 }, { "epoch": 0.2005, "grad_norm": 1.9375, "learning_rate": 9.05569756718348e-06, "loss": 1.3954, "step": 4010 }, { "epoch": 0.201, "grad_norm": 1.9609375, "learning_rate": 9.051089917813809e-06, "loss": 1.4065, "step": 4020 }, { "epoch": 0.2015, "grad_norm": 1.84375, "learning_rate": 9.046472232679563e-06, "loss": 1.3973, "step": 4030 }, { "epoch": 0.202, "grad_norm": 1.75, "learning_rate": 9.041844523220138e-06, "loss": 1.4095, "step": 4040 }, { "epoch": 0.2025, "grad_norm": 2.09375, "learning_rate": 9.037206800899755e-06, "loss": 1.397, "step": 4050 }, { "epoch": 0.203, "grad_norm": 2.03125, "learning_rate": 9.032559077207443e-06, "loss": 1.4029, "step": 4060 }, { "epoch": 0.2035, "grad_norm": 1.9609375, "learning_rate": 9.027901363657009e-06, "loss": 1.3951, "step": 4070 }, { "epoch": 0.204, "grad_norm": 2.0625, "learning_rate": 9.023233671787005e-06, "loss": 1.3977, "step": 4080 }, { "epoch": 0.2045, "grad_norm": 1.890625, "learning_rate": 9.018556013160703e-06, "loss": 1.4049, "step": 4090 }, { "epoch": 0.205, "grad_norm": 1.9296875, "learning_rate": 9.013868399366068e-06, "loss": 1.397, "step": 4100 }, { "epoch": 0.2055, "grad_norm": 2.0, "learning_rate": 9.009170842015722e-06, "loss": 1.4102, "step": 4110 }, { "epoch": 0.206, "grad_norm": 4.8125, "learning_rate": 9.004463352746929e-06, "loss": 1.4177, "step": 4120 }, { "epoch": 0.2065, "grad_norm": 2.15625, "learning_rate": 8.999745943221544e-06, "loss": 1.3877, "step": 4130 }, { "epoch": 0.207, "grad_norm": 1.9140625, "learning_rate": 8.995018625126011e-06, "loss": 1.3988, "step": 4140 }, { "epoch": 0.2075, "grad_norm": 1.8828125, "learning_rate": 8.990281410171311e-06, "loss": 1.3923, "step": 4150 }, { "epoch": 0.208, "grad_norm": 2.03125, "learning_rate": 8.98553431009295e-06, "loss": 1.3984, "step": 4160 }, { "epoch": 0.2085, "grad_norm": 1.9609375, "learning_rate": 8.980777336650913e-06, "loss": 1.4, "step": 4170 }, { "epoch": 0.209, "grad_norm": 2.078125, "learning_rate": 8.976010501629655e-06, "loss": 1.408, "step": 4180 }, { "epoch": 0.2095, "grad_norm": 2.140625, "learning_rate": 8.971233816838052e-06, "loss": 1.3975, "step": 4190 }, { "epoch": 0.21, "grad_norm": 2.015625, "learning_rate": 8.966447294109387e-06, "loss": 1.3865, "step": 4200 }, { "epoch": 0.2105, "grad_norm": 1.9609375, "learning_rate": 8.961650945301313e-06, "loss": 1.3986, "step": 4210 }, { "epoch": 0.211, "grad_norm": 2.15625, "learning_rate": 8.95684478229582e-06, "loss": 1.4108, "step": 4220 }, { "epoch": 0.2115, "grad_norm": 1.71875, "learning_rate": 8.952028816999219e-06, "loss": 1.3955, "step": 4230 }, { "epoch": 0.212, "grad_norm": 1.875, "learning_rate": 8.9472030613421e-06, "loss": 1.4167, "step": 4240 }, { "epoch": 0.2125, "grad_norm": 1.984375, "learning_rate": 8.942367527279305e-06, "loss": 1.3862, "step": 4250 }, { "epoch": 0.213, "grad_norm": 1.8515625, "learning_rate": 8.937522226789903e-06, "loss": 1.3849, "step": 4260 }, { "epoch": 0.2135, "grad_norm": 1.8046875, "learning_rate": 8.932667171877156e-06, "loss": 1.3931, "step": 4270 }, { "epoch": 0.214, "grad_norm": 1.796875, "learning_rate": 8.927802374568491e-06, "loss": 1.4082, "step": 4280 }, { "epoch": 0.2145, "grad_norm": 1.7265625, "learning_rate": 8.92292784691547e-06, "loss": 1.4027, "step": 4290 }, { "epoch": 0.215, "grad_norm": 1.8515625, "learning_rate": 8.918043600993762e-06, "loss": 1.4002, "step": 4300 }, { "epoch": 0.2155, "grad_norm": 1.84375, "learning_rate": 8.913149648903103e-06, "loss": 1.3878, "step": 4310 }, { "epoch": 0.216, "grad_norm": 2.015625, "learning_rate": 8.908246002767287e-06, "loss": 1.4137, "step": 4320 }, { "epoch": 0.2165, "grad_norm": 2.03125, "learning_rate": 8.90333267473411e-06, "loss": 1.4063, "step": 4330 }, { "epoch": 0.217, "grad_norm": 1.984375, "learning_rate": 8.898409676975363e-06, "loss": 1.4168, "step": 4340 }, { "epoch": 0.2175, "grad_norm": 2.21875, "learning_rate": 8.893477021686785e-06, "loss": 1.3913, "step": 4350 }, { "epoch": 0.218, "grad_norm": 1.75, "learning_rate": 8.888534721088045e-06, "loss": 1.3929, "step": 4360 }, { "epoch": 0.2185, "grad_norm": 1.765625, "learning_rate": 8.883582787422702e-06, "loss": 1.4178, "step": 4370 }, { "epoch": 0.219, "grad_norm": 2.34375, "learning_rate": 8.87862123295818e-06, "loss": 1.422, "step": 4380 }, { "epoch": 0.2195, "grad_norm": 1.96875, "learning_rate": 8.873650069985739e-06, "loss": 1.4043, "step": 4390 }, { "epoch": 0.22, "grad_norm": 1.7421875, "learning_rate": 8.86866931082044e-06, "loss": 1.3853, "step": 4400 }, { "epoch": 0.2205, "grad_norm": 1.953125, "learning_rate": 8.863678967801117e-06, "loss": 1.413, "step": 4410 }, { "epoch": 0.221, "grad_norm": 1.671875, "learning_rate": 8.858679053290346e-06, "loss": 1.3938, "step": 4420 }, { "epoch": 0.2215, "grad_norm": 1.984375, "learning_rate": 8.853669579674414e-06, "loss": 1.3869, "step": 4430 }, { "epoch": 0.222, "grad_norm": 2.0625, "learning_rate": 8.84865055936329e-06, "loss": 1.3896, "step": 4440 }, { "epoch": 0.2225, "grad_norm": 2.0, "learning_rate": 8.843622004790593e-06, "loss": 1.3821, "step": 4450 }, { "epoch": 0.223, "grad_norm": 2.046875, "learning_rate": 8.83858392841356e-06, "loss": 1.3997, "step": 4460 }, { "epoch": 0.2235, "grad_norm": 2.046875, "learning_rate": 8.833536342713016e-06, "loss": 1.4146, "step": 4470 }, { "epoch": 0.224, "grad_norm": 1.9765625, "learning_rate": 8.828479260193345e-06, "loss": 1.4166, "step": 4480 }, { "epoch": 0.2245, "grad_norm": 1.9296875, "learning_rate": 8.823412693382459e-06, "loss": 1.3898, "step": 4490 }, { "epoch": 0.225, "grad_norm": 2.109375, "learning_rate": 8.818336654831761e-06, "loss": 1.3968, "step": 4500 }, { "epoch": 0.2255, "grad_norm": 1.8671875, "learning_rate": 8.813251157116123e-06, "loss": 1.4121, "step": 4510 }, { "epoch": 0.226, "grad_norm": 1.8984375, "learning_rate": 8.808156212833844e-06, "loss": 1.405, "step": 4520 }, { "epoch": 0.2265, "grad_norm": 1.6953125, "learning_rate": 8.803051834606635e-06, "loss": 1.4188, "step": 4530 }, { "epoch": 0.227, "grad_norm": 1.9375, "learning_rate": 8.797938035079564e-06, "loss": 1.4127, "step": 4540 }, { "epoch": 0.2275, "grad_norm": 1.9140625, "learning_rate": 8.792814826921053e-06, "loss": 1.3981, "step": 4550 }, { "epoch": 0.228, "grad_norm": 1.8984375, "learning_rate": 8.78768222282282e-06, "loss": 1.3992, "step": 4560 }, { "epoch": 0.2285, "grad_norm": 1.9765625, "learning_rate": 8.782540235499866e-06, "loss": 1.397, "step": 4570 }, { "epoch": 0.229, "grad_norm": 2.09375, "learning_rate": 8.777388877690436e-06, "loss": 1.3968, "step": 4580 }, { "epoch": 0.2295, "grad_norm": 1.828125, "learning_rate": 8.772228162155988e-06, "loss": 1.4085, "step": 4590 }, { "epoch": 0.23, "grad_norm": 1.859375, "learning_rate": 8.767058101681163e-06, "loss": 1.3975, "step": 4600 }, { "epoch": 0.2305, "grad_norm": 2.109375, "learning_rate": 8.761878709073748e-06, "loss": 1.3882, "step": 4610 }, { "epoch": 0.231, "grad_norm": 1.9765625, "learning_rate": 8.756689997164655e-06, "loss": 1.4051, "step": 4620 }, { "epoch": 0.2315, "grad_norm": 2.15625, "learning_rate": 8.751491978807878e-06, "loss": 1.4115, "step": 4630 }, { "epoch": 0.232, "grad_norm": 1.8984375, "learning_rate": 8.746284666880468e-06, "loss": 1.3913, "step": 4640 }, { "epoch": 0.2325, "grad_norm": 2.0, "learning_rate": 8.741068074282498e-06, "loss": 1.3859, "step": 4650 }, { "epoch": 0.233, "grad_norm": 1.9296875, "learning_rate": 8.735842213937031e-06, "loss": 1.4062, "step": 4660 }, { "epoch": 0.2335, "grad_norm": 1.9296875, "learning_rate": 8.730607098790093e-06, "loss": 1.4113, "step": 4670 }, { "epoch": 0.234, "grad_norm": 1.875, "learning_rate": 8.72536274181063e-06, "loss": 1.4177, "step": 4680 }, { "epoch": 0.2345, "grad_norm": 2.078125, "learning_rate": 8.72010915599049e-06, "loss": 1.4028, "step": 4690 }, { "epoch": 0.235, "grad_norm": 1.8359375, "learning_rate": 8.714846354344381e-06, "loss": 1.4032, "step": 4700 }, { "epoch": 0.2355, "grad_norm": 2.109375, "learning_rate": 8.709574349909837e-06, "loss": 1.392, "step": 4710 }, { "epoch": 0.236, "grad_norm": 2.171875, "learning_rate": 8.704293155747199e-06, "loss": 1.3892, "step": 4720 }, { "epoch": 0.2365, "grad_norm": 1.9921875, "learning_rate": 8.699002784939565e-06, "loss": 1.3882, "step": 4730 }, { "epoch": 0.237, "grad_norm": 1.90625, "learning_rate": 8.693703250592772e-06, "loss": 1.3898, "step": 4740 }, { "epoch": 0.2375, "grad_norm": 1.953125, "learning_rate": 8.688394565835354e-06, "loss": 1.401, "step": 4750 }, { "epoch": 0.238, "grad_norm": 2.0, "learning_rate": 8.683076743818518e-06, "loss": 1.3826, "step": 4760 }, { "epoch": 0.2385, "grad_norm": 1.890625, "learning_rate": 8.6777497977161e-06, "loss": 1.3888, "step": 4770 }, { "epoch": 0.239, "grad_norm": 2.046875, "learning_rate": 8.672413740724545e-06, "loss": 1.387, "step": 4780 }, { "epoch": 0.2395, "grad_norm": 1.9921875, "learning_rate": 8.667068586062868e-06, "loss": 1.4012, "step": 4790 }, { "epoch": 0.24, "grad_norm": 2.03125, "learning_rate": 8.661714346972615e-06, "loss": 1.3883, "step": 4800 }, { "epoch": 0.2405, "grad_norm": 2.046875, "learning_rate": 8.65635103671785e-06, "loss": 1.3931, "step": 4810 }, { "epoch": 0.241, "grad_norm": 1.8359375, "learning_rate": 8.650978668585093e-06, "loss": 1.4025, "step": 4820 }, { "epoch": 0.2415, "grad_norm": 2.0, "learning_rate": 8.645597255883313e-06, "loss": 1.3946, "step": 4830 }, { "epoch": 0.242, "grad_norm": 2.21875, "learning_rate": 8.640206811943888e-06, "loss": 1.4117, "step": 4840 }, { "epoch": 0.2425, "grad_norm": 1.8359375, "learning_rate": 8.634807350120557e-06, "loss": 1.3897, "step": 4850 }, { "epoch": 0.243, "grad_norm": 1.8203125, "learning_rate": 8.629398883789409e-06, "loss": 1.3857, "step": 4860 }, { "epoch": 0.2435, "grad_norm": 1.90625, "learning_rate": 8.623981426348839e-06, "loss": 1.3855, "step": 4870 }, { "epoch": 0.244, "grad_norm": 1.8203125, "learning_rate": 8.61855499121951e-06, "loss": 1.3887, "step": 4880 }, { "epoch": 0.2445, "grad_norm": 1.765625, "learning_rate": 8.613119591844332e-06, "loss": 1.4041, "step": 4890 }, { "epoch": 0.245, "grad_norm": 2.140625, "learning_rate": 8.60767524168842e-06, "loss": 1.3997, "step": 4900 }, { "epoch": 0.2455, "grad_norm": 1.9140625, "learning_rate": 8.60222195423906e-06, "loss": 1.4069, "step": 4910 }, { "epoch": 0.246, "grad_norm": 1.8125, "learning_rate": 8.596759743005677e-06, "loss": 1.4059, "step": 4920 }, { "epoch": 0.2465, "grad_norm": 1.9296875, "learning_rate": 8.591288621519813e-06, "loss": 1.4049, "step": 4930 }, { "epoch": 0.247, "grad_norm": 1.8125, "learning_rate": 8.585808603335072e-06, "loss": 1.3831, "step": 4940 }, { "epoch": 0.2475, "grad_norm": 2.0625, "learning_rate": 8.580319702027104e-06, "loss": 1.4027, "step": 4950 }, { "epoch": 0.248, "grad_norm": 1.96875, "learning_rate": 8.574821931193564e-06, "loss": 1.3897, "step": 4960 }, { "epoch": 0.2485, "grad_norm": 1.9140625, "learning_rate": 8.569315304454079e-06, "loss": 1.3756, "step": 4970 }, { "epoch": 0.249, "grad_norm": 1.8828125, "learning_rate": 8.563799835450214e-06, "loss": 1.3843, "step": 4980 }, { "epoch": 0.2495, "grad_norm": 1.8515625, "learning_rate": 8.55827553784544e-06, "loss": 1.389, "step": 4990 }, { "epoch": 0.25, "grad_norm": 2.28125, "learning_rate": 8.552742425325098e-06, "loss": 1.3981, "step": 5000 }, { "epoch": 0.2505, "grad_norm": 1.8203125, "learning_rate": 8.547200511596367e-06, "loss": 1.3933, "step": 5010 }, { "epoch": 0.251, "grad_norm": 2.046875, "learning_rate": 8.541649810388232e-06, "loss": 1.3965, "step": 5020 }, { "epoch": 0.2515, "grad_norm": 2.078125, "learning_rate": 8.536090335451441e-06, "loss": 1.4005, "step": 5030 }, { "epoch": 0.252, "grad_norm": 1.875, "learning_rate": 8.530522100558482e-06, "loss": 1.3981, "step": 5040 }, { "epoch": 0.2525, "grad_norm": 2.109375, "learning_rate": 8.524945119503542e-06, "loss": 1.3927, "step": 5050 }, { "epoch": 0.253, "grad_norm": 2.765625, "learning_rate": 8.519359406102479e-06, "loss": 1.3806, "step": 5060 }, { "epoch": 0.2535, "grad_norm": 1.7578125, "learning_rate": 8.513764974192775e-06, "loss": 1.4014, "step": 5070 }, { "epoch": 0.254, "grad_norm": 2.078125, "learning_rate": 8.508161837633517e-06, "loss": 1.4147, "step": 5080 }, { "epoch": 0.2545, "grad_norm": 1.9609375, "learning_rate": 8.502550010305357e-06, "loss": 1.3971, "step": 5090 }, { "epoch": 0.255, "grad_norm": 2.0, "learning_rate": 8.496929506110473e-06, "loss": 1.3988, "step": 5100 }, { "epoch": 0.2555, "grad_norm": 1.859375, "learning_rate": 8.491300338972537e-06, "loss": 1.3918, "step": 5110 }, { "epoch": 0.256, "grad_norm": 1.9375, "learning_rate": 8.485662522836687e-06, "loss": 1.4069, "step": 5120 }, { "epoch": 0.2565, "grad_norm": 2.0, "learning_rate": 8.480016071669483e-06, "loss": 1.3951, "step": 5130 }, { "epoch": 0.257, "grad_norm": 1.9296875, "learning_rate": 8.47436099945888e-06, "loss": 1.4024, "step": 5140 }, { "epoch": 0.2575, "grad_norm": 1.9140625, "learning_rate": 8.468697320214183e-06, "loss": 1.3934, "step": 5150 }, { "epoch": 0.258, "grad_norm": 1.9765625, "learning_rate": 8.463025047966028e-06, "loss": 1.4075, "step": 5160 }, { "epoch": 0.2585, "grad_norm": 1.90625, "learning_rate": 8.457344196766331e-06, "loss": 1.3935, "step": 5170 }, { "epoch": 0.259, "grad_norm": 2.15625, "learning_rate": 8.451654780688268e-06, "loss": 1.3865, "step": 5180 }, { "epoch": 0.2595, "grad_norm": 1.6953125, "learning_rate": 8.445956813826226e-06, "loss": 1.3828, "step": 5190 }, { "epoch": 0.26, "grad_norm": 1.96875, "learning_rate": 8.44025031029578e-06, "loss": 1.407, "step": 5200 }, { "epoch": 0.2605, "grad_norm": 1.8359375, "learning_rate": 8.434535284233645e-06, "loss": 1.4003, "step": 5210 }, { "epoch": 0.261, "grad_norm": 1.8984375, "learning_rate": 8.428811749797663e-06, "loss": 1.4106, "step": 5220 }, { "epoch": 0.2615, "grad_norm": 1.8359375, "learning_rate": 8.423079721166736e-06, "loss": 1.3986, "step": 5230 }, { "epoch": 0.262, "grad_norm": 2.046875, "learning_rate": 8.417339212540826e-06, "loss": 1.4024, "step": 5240 }, { "epoch": 0.2625, "grad_norm": 2.28125, "learning_rate": 8.411590238140887e-06, "loss": 1.3934, "step": 5250 }, { "epoch": 0.263, "grad_norm": 1.8828125, "learning_rate": 8.40583281220886e-06, "loss": 1.3954, "step": 5260 }, { "epoch": 0.2635, "grad_norm": 2.0, "learning_rate": 8.400066949007611e-06, "loss": 1.3816, "step": 5270 }, { "epoch": 0.264, "grad_norm": 1.921875, "learning_rate": 8.394292662820911e-06, "loss": 1.387, "step": 5280 }, { "epoch": 0.2645, "grad_norm": 2.03125, "learning_rate": 8.388509967953401e-06, "loss": 1.402, "step": 5290 }, { "epoch": 0.265, "grad_norm": 2.015625, "learning_rate": 8.382718878730554e-06, "loss": 1.3939, "step": 5300 }, { "epoch": 0.2655, "grad_norm": 2.03125, "learning_rate": 8.376919409498627e-06, "loss": 1.3958, "step": 5310 }, { "epoch": 0.266, "grad_norm": 1.8125, "learning_rate": 8.371111574624648e-06, "loss": 1.3847, "step": 5320 }, { "epoch": 0.2665, "grad_norm": 1.8984375, "learning_rate": 8.365295388496366e-06, "loss": 1.3841, "step": 5330 }, { "epoch": 0.267, "grad_norm": 1.8203125, "learning_rate": 8.359470865522216e-06, "loss": 1.3861, "step": 5340 }, { "epoch": 0.2675, "grad_norm": 2.140625, "learning_rate": 8.353638020131293e-06, "loss": 1.3947, "step": 5350 }, { "epoch": 0.268, "grad_norm": 1.9921875, "learning_rate": 8.347796866773298e-06, "loss": 1.3915, "step": 5360 }, { "epoch": 0.2685, "grad_norm": 2.015625, "learning_rate": 8.341947419918526e-06, "loss": 1.3972, "step": 5370 }, { "epoch": 0.269, "grad_norm": 2.46875, "learning_rate": 8.336089694057806e-06, "loss": 1.4064, "step": 5380 }, { "epoch": 0.2695, "grad_norm": 1.8203125, "learning_rate": 8.330223703702482e-06, "loss": 1.3843, "step": 5390 }, { "epoch": 0.27, "grad_norm": 1.8671875, "learning_rate": 8.324349463384373e-06, "loss": 1.4135, "step": 5400 }, { "epoch": 0.2705, "grad_norm": 2.25, "learning_rate": 8.318466987655735e-06, "loss": 1.3823, "step": 5410 }, { "epoch": 0.271, "grad_norm": 1.8828125, "learning_rate": 8.31257629108922e-06, "loss": 1.3834, "step": 5420 }, { "epoch": 0.2715, "grad_norm": 1.953125, "learning_rate": 8.306677388277855e-06, "loss": 1.409, "step": 5430 }, { "epoch": 0.272, "grad_norm": 2.078125, "learning_rate": 8.300770293834987e-06, "loss": 1.3822, "step": 5440 }, { "epoch": 0.2725, "grad_norm": 2.15625, "learning_rate": 8.294855022394263e-06, "loss": 1.4069, "step": 5450 }, { "epoch": 0.273, "grad_norm": 2.28125, "learning_rate": 8.288931588609583e-06, "loss": 1.3751, "step": 5460 }, { "epoch": 0.2735, "grad_norm": 1.9765625, "learning_rate": 8.283000007155069e-06, "loss": 1.4046, "step": 5470 }, { "epoch": 0.274, "grad_norm": 3.96875, "learning_rate": 8.277060292725028e-06, "loss": 1.3898, "step": 5480 }, { "epoch": 0.2745, "grad_norm": 1.71875, "learning_rate": 8.271112460033913e-06, "loss": 1.3733, "step": 5490 }, { "epoch": 0.275, "grad_norm": 1.9296875, "learning_rate": 8.265156523816287e-06, "loss": 1.3973, "step": 5500 }, { "epoch": 0.2755, "grad_norm": 2.03125, "learning_rate": 8.259192498826795e-06, "loss": 1.4072, "step": 5510 }, { "epoch": 0.276, "grad_norm": 1.9765625, "learning_rate": 8.253220399840111e-06, "loss": 1.3954, "step": 5520 }, { "epoch": 0.2765, "grad_norm": 4.78125, "learning_rate": 8.247240241650918e-06, "loss": 1.3914, "step": 5530 }, { "epoch": 0.277, "grad_norm": 1.828125, "learning_rate": 8.24125203907386e-06, "loss": 1.3923, "step": 5540 }, { "epoch": 0.2775, "grad_norm": 2.015625, "learning_rate": 8.235255806943512e-06, "loss": 1.4034, "step": 5550 }, { "epoch": 0.278, "grad_norm": 1.84375, "learning_rate": 8.229251560114335e-06, "loss": 1.4031, "step": 5560 }, { "epoch": 0.2785, "grad_norm": 1.8359375, "learning_rate": 8.223239313460654e-06, "loss": 1.3844, "step": 5570 }, { "epoch": 0.279, "grad_norm": 2.125, "learning_rate": 8.217219081876604e-06, "loss": 1.396, "step": 5580 }, { "epoch": 0.2795, "grad_norm": 1.9140625, "learning_rate": 8.211190880276103e-06, "loss": 1.4066, "step": 5590 }, { "epoch": 0.28, "grad_norm": 2.046875, "learning_rate": 8.205154723592818e-06, "loss": 1.4016, "step": 5600 }, { "epoch": 0.2805, "grad_norm": 1.75, "learning_rate": 8.199110626780115e-06, "loss": 1.3875, "step": 5610 }, { "epoch": 0.281, "grad_norm": 1.765625, "learning_rate": 8.193058604811038e-06, "loss": 1.3912, "step": 5620 }, { "epoch": 0.2815, "grad_norm": 2.125, "learning_rate": 8.186998672678257e-06, "loss": 1.3953, "step": 5630 }, { "epoch": 0.282, "grad_norm": 1.828125, "learning_rate": 8.180930845394042e-06, "loss": 1.3753, "step": 5640 }, { "epoch": 0.2825, "grad_norm": 2.109375, "learning_rate": 8.174855137990224e-06, "loss": 1.384, "step": 5650 }, { "epoch": 0.283, "grad_norm": 1.75, "learning_rate": 8.168771565518148e-06, "loss": 1.3947, "step": 5660 }, { "epoch": 0.2835, "grad_norm": 1.84375, "learning_rate": 8.16268014304865e-06, "loss": 1.3803, "step": 5670 }, { "epoch": 0.284, "grad_norm": 2.125, "learning_rate": 8.156580885672008e-06, "loss": 1.3943, "step": 5680 }, { "epoch": 0.2845, "grad_norm": 2.53125, "learning_rate": 8.150473808497916e-06, "loss": 1.4005, "step": 5690 }, { "epoch": 0.285, "grad_norm": 2.28125, "learning_rate": 8.14435892665543e-06, "loss": 1.42, "step": 5700 }, { "epoch": 0.2855, "grad_norm": 1.9609375, "learning_rate": 8.138236255292948e-06, "loss": 1.3973, "step": 5710 }, { "epoch": 0.286, "grad_norm": 2.125, "learning_rate": 8.132105809578163e-06, "loss": 1.3869, "step": 5720 }, { "epoch": 0.2865, "grad_norm": 1.765625, "learning_rate": 8.12596760469803e-06, "loss": 1.3758, "step": 5730 }, { "epoch": 0.287, "grad_norm": 1.7265625, "learning_rate": 8.119821655858721e-06, "loss": 1.4025, "step": 5740 }, { "epoch": 0.2875, "grad_norm": 1.9765625, "learning_rate": 8.113667978285593e-06, "loss": 1.376, "step": 5750 }, { "epoch": 0.288, "grad_norm": 2.90625, "learning_rate": 8.107506587223153e-06, "loss": 1.3854, "step": 5760 }, { "epoch": 0.2885, "grad_norm": 1.90625, "learning_rate": 8.101337497935014e-06, "loss": 1.4105, "step": 5770 }, { "epoch": 0.289, "grad_norm": 1.96875, "learning_rate": 8.095160725703859e-06, "loss": 1.4089, "step": 5780 }, { "epoch": 0.2895, "grad_norm": 2.171875, "learning_rate": 8.088976285831406e-06, "loss": 1.3973, "step": 5790 }, { "epoch": 0.29, "grad_norm": 1.8515625, "learning_rate": 8.082784193638369e-06, "loss": 1.4075, "step": 5800 }, { "epoch": 0.2905, "grad_norm": 1.90625, "learning_rate": 8.076584464464412e-06, "loss": 1.3939, "step": 5810 }, { "epoch": 0.291, "grad_norm": 1.7578125, "learning_rate": 8.070377113668129e-06, "loss": 1.38, "step": 5820 }, { "epoch": 0.2915, "grad_norm": 1.9375, "learning_rate": 8.064162156626988e-06, "loss": 1.3815, "step": 5830 }, { "epoch": 0.292, "grad_norm": 1.875, "learning_rate": 8.057939608737297e-06, "loss": 1.384, "step": 5840 }, { "epoch": 0.2925, "grad_norm": 2.0, "learning_rate": 8.051709485414178e-06, "loss": 1.3957, "step": 5850 }, { "epoch": 0.293, "grad_norm": 2.078125, "learning_rate": 8.045471802091512e-06, "loss": 1.3898, "step": 5860 }, { "epoch": 0.2935, "grad_norm": 1.8359375, "learning_rate": 8.039226574221914e-06, "loss": 1.3818, "step": 5870 }, { "epoch": 0.294, "grad_norm": 2.046875, "learning_rate": 8.032973817276684e-06, "loss": 1.3669, "step": 5880 }, { "epoch": 0.2945, "grad_norm": 1.828125, "learning_rate": 8.026713546745777e-06, "loss": 1.3882, "step": 5890 }, { "epoch": 0.295, "grad_norm": 1.8671875, "learning_rate": 8.020445778137759e-06, "loss": 1.4092, "step": 5900 }, { "epoch": 0.2955, "grad_norm": 2.171875, "learning_rate": 8.014170526979776e-06, "loss": 1.3997, "step": 5910 }, { "epoch": 0.296, "grad_norm": 1.9921875, "learning_rate": 8.007887808817504e-06, "loss": 1.3901, "step": 5920 }, { "epoch": 0.2965, "grad_norm": 1.984375, "learning_rate": 8.00159763921512e-06, "loss": 1.3965, "step": 5930 }, { "epoch": 0.297, "grad_norm": 2.0, "learning_rate": 7.99530003375526e-06, "loss": 1.3791, "step": 5940 }, { "epoch": 0.2975, "grad_norm": 1.8515625, "learning_rate": 7.988995008038982e-06, "loss": 1.3651, "step": 5950 }, { "epoch": 0.298, "grad_norm": 1.90625, "learning_rate": 7.982682577685728e-06, "loss": 1.4061, "step": 5960 }, { "epoch": 0.2985, "grad_norm": 2.015625, "learning_rate": 7.976362758333276e-06, "loss": 1.3936, "step": 5970 }, { "epoch": 0.299, "grad_norm": 1.8515625, "learning_rate": 7.970035565637717e-06, "loss": 1.3885, "step": 5980 }, { "epoch": 0.2995, "grad_norm": 2.640625, "learning_rate": 7.963701015273404e-06, "loss": 1.3913, "step": 5990 }, { "epoch": 0.3, "grad_norm": 1.953125, "learning_rate": 7.957359122932918e-06, "loss": 1.3858, "step": 6000 }, { "epoch": 0.3005, "grad_norm": 3.796875, "learning_rate": 7.95100990432703e-06, "loss": 1.3831, "step": 6010 }, { "epoch": 0.301, "grad_norm": 1.953125, "learning_rate": 7.944653375184653e-06, "loss": 1.3805, "step": 6020 }, { "epoch": 0.3015, "grad_norm": 2.078125, "learning_rate": 7.938289551252823e-06, "loss": 1.4066, "step": 6030 }, { "epoch": 0.302, "grad_norm": 1.8671875, "learning_rate": 7.931918448296635e-06, "loss": 1.3692, "step": 6040 }, { "epoch": 0.3025, "grad_norm": 1.75, "learning_rate": 7.92554008209922e-06, "loss": 1.3845, "step": 6050 }, { "epoch": 0.303, "grad_norm": 1.96875, "learning_rate": 7.919154468461712e-06, "loss": 1.3837, "step": 6060 }, { "epoch": 0.3035, "grad_norm": 2.109375, "learning_rate": 7.912761623203185e-06, "loss": 1.384, "step": 6070 }, { "epoch": 0.304, "grad_norm": 2.0, "learning_rate": 7.906361562160633e-06, "loss": 1.3889, "step": 6080 }, { "epoch": 0.3045, "grad_norm": 1.9140625, "learning_rate": 7.899954301188929e-06, "loss": 1.3851, "step": 6090 }, { "epoch": 0.305, "grad_norm": 1.9375, "learning_rate": 7.89353985616078e-06, "loss": 1.3948, "step": 6100 }, { "epoch": 0.3055, "grad_norm": 1.8046875, "learning_rate": 7.887118242966688e-06, "loss": 1.3869, "step": 6110 }, { "epoch": 0.306, "grad_norm": 2.4375, "learning_rate": 7.880689477514916e-06, "loss": 1.3749, "step": 6120 }, { "epoch": 0.3065, "grad_norm": 2.25, "learning_rate": 7.874253575731443e-06, "loss": 1.3857, "step": 6130 }, { "epoch": 0.307, "grad_norm": 1.90625, "learning_rate": 7.86781055355993e-06, "loss": 1.3973, "step": 6140 }, { "epoch": 0.3075, "grad_norm": 1.8984375, "learning_rate": 7.861360426961671e-06, "loss": 1.395, "step": 6150 }, { "epoch": 0.308, "grad_norm": 2.046875, "learning_rate": 7.854903211915568e-06, "loss": 1.3911, "step": 6160 }, { "epoch": 0.3085, "grad_norm": 1.7890625, "learning_rate": 7.848438924418075e-06, "loss": 1.3951, "step": 6170 }, { "epoch": 0.309, "grad_norm": 1.8828125, "learning_rate": 7.841967580483175e-06, "loss": 1.3883, "step": 6180 }, { "epoch": 0.3095, "grad_norm": 2.015625, "learning_rate": 7.835489196142325e-06, "loss": 1.3948, "step": 6190 }, { "epoch": 0.31, "grad_norm": 2.21875, "learning_rate": 7.829003787444427e-06, "loss": 1.3953, "step": 6200 }, { "epoch": 0.3105, "grad_norm": 1.828125, "learning_rate": 7.82251137045578e-06, "loss": 1.3483, "step": 6210 }, { "epoch": 0.311, "grad_norm": 1.7109375, "learning_rate": 7.816011961260051e-06, "loss": 1.3977, "step": 6220 }, { "epoch": 0.3115, "grad_norm": 2.203125, "learning_rate": 7.809505575958225e-06, "loss": 1.3701, "step": 6230 }, { "epoch": 0.312, "grad_norm": 1.9375, "learning_rate": 7.802992230668572e-06, "loss": 1.3701, "step": 6240 }, { "epoch": 0.3125, "grad_norm": 1.90625, "learning_rate": 7.796471941526596e-06, "loss": 1.3964, "step": 6250 }, { "epoch": 0.313, "grad_norm": 1.828125, "learning_rate": 7.789944724685013e-06, "loss": 1.3841, "step": 6260 }, { "epoch": 0.3135, "grad_norm": 1.9453125, "learning_rate": 7.783410596313692e-06, "loss": 1.3953, "step": 6270 }, { "epoch": 0.314, "grad_norm": 1.9140625, "learning_rate": 7.776869572599634e-06, "loss": 1.375, "step": 6280 }, { "epoch": 0.3145, "grad_norm": 2.140625, "learning_rate": 7.770321669746912e-06, "loss": 1.4022, "step": 6290 }, { "epoch": 0.315, "grad_norm": 1.9140625, "learning_rate": 7.763766903976648e-06, "loss": 1.3615, "step": 6300 }, { "epoch": 0.3155, "grad_norm": 2.296875, "learning_rate": 7.757205291526961e-06, "loss": 1.3948, "step": 6310 }, { "epoch": 0.316, "grad_norm": 1.9609375, "learning_rate": 7.750636848652933e-06, "loss": 1.3908, "step": 6320 }, { "epoch": 0.3165, "grad_norm": 2.03125, "learning_rate": 7.744061591626563e-06, "loss": 1.3792, "step": 6330 }, { "epoch": 0.317, "grad_norm": 1.953125, "learning_rate": 7.737479536736743e-06, "loss": 1.3736, "step": 6340 }, { "epoch": 0.3175, "grad_norm": 1.7890625, "learning_rate": 7.730890700289189e-06, "loss": 1.3834, "step": 6350 }, { "epoch": 0.318, "grad_norm": 1.890625, "learning_rate": 7.724295098606429e-06, "loss": 1.3826, "step": 6360 }, { "epoch": 0.3185, "grad_norm": 1.953125, "learning_rate": 7.717692748027742e-06, "loss": 1.3797, "step": 6370 }, { "epoch": 0.319, "grad_norm": 2.234375, "learning_rate": 7.711083664909137e-06, "loss": 1.4009, "step": 6380 }, { "epoch": 0.3195, "grad_norm": 2.078125, "learning_rate": 7.704467865623288e-06, "loss": 1.3901, "step": 6390 }, { "epoch": 0.32, "grad_norm": 1.9375, "learning_rate": 7.69784536655952e-06, "loss": 1.3944, "step": 6400 }, { "epoch": 0.3205, "grad_norm": 1.84375, "learning_rate": 7.691216184123742e-06, "loss": 1.3956, "step": 6410 }, { "epoch": 0.321, "grad_norm": 2.0, "learning_rate": 7.684580334738433e-06, "loss": 1.4027, "step": 6420 }, { "epoch": 0.3215, "grad_norm": 2.078125, "learning_rate": 7.677937834842579e-06, "loss": 1.3898, "step": 6430 }, { "epoch": 0.322, "grad_norm": 1.8984375, "learning_rate": 7.671288700891643e-06, "loss": 1.3935, "step": 6440 }, { "epoch": 0.3225, "grad_norm": 1.9609375, "learning_rate": 7.66463294935753e-06, "loss": 1.3862, "step": 6450 }, { "epoch": 0.323, "grad_norm": 1.7421875, "learning_rate": 7.657970596728526e-06, "loss": 1.3915, "step": 6460 }, { "epoch": 0.3235, "grad_norm": 2.046875, "learning_rate": 7.651301659509275e-06, "loss": 1.3929, "step": 6470 }, { "epoch": 0.324, "grad_norm": 1.921875, "learning_rate": 7.644626154220743e-06, "loss": 1.3731, "step": 6480 }, { "epoch": 0.3245, "grad_norm": 2.203125, "learning_rate": 7.63794409740015e-06, "loss": 1.4082, "step": 6490 }, { "epoch": 0.325, "grad_norm": 1.8671875, "learning_rate": 7.631255505600959e-06, "loss": 1.3758, "step": 6500 }, { "epoch": 0.3255, "grad_norm": 2.1875, "learning_rate": 7.624560395392815e-06, "loss": 1.3872, "step": 6510 }, { "epoch": 0.326, "grad_norm": 1.8359375, "learning_rate": 7.617858783361518e-06, "loss": 1.3735, "step": 6520 }, { "epoch": 0.3265, "grad_norm": 1.9375, "learning_rate": 7.611150686108966e-06, "loss": 1.4021, "step": 6530 }, { "epoch": 0.327, "grad_norm": 1.8828125, "learning_rate": 7.604436120253129e-06, "loss": 1.3938, "step": 6540 }, { "epoch": 0.3275, "grad_norm": 1.9765625, "learning_rate": 7.597715102428002e-06, "loss": 1.3774, "step": 6550 }, { "epoch": 0.328, "grad_norm": 2.046875, "learning_rate": 7.590987649283561e-06, "loss": 1.379, "step": 6560 }, { "epoch": 0.3285, "grad_norm": 1.9765625, "learning_rate": 7.584253777485722e-06, "loss": 1.3985, "step": 6570 }, { "epoch": 0.329, "grad_norm": 2.078125, "learning_rate": 7.577513503716308e-06, "loss": 1.3808, "step": 6580 }, { "epoch": 0.3295, "grad_norm": 1.8359375, "learning_rate": 7.570766844672997e-06, "loss": 1.4061, "step": 6590 }, { "epoch": 0.33, "grad_norm": 1.7578125, "learning_rate": 7.5640138170692885e-06, "loss": 1.3854, "step": 6600 }, { "epoch": 0.3305, "grad_norm": 1.828125, "learning_rate": 7.557254437634453e-06, "loss": 1.3835, "step": 6610 }, { "epoch": 0.331, "grad_norm": 1.9453125, "learning_rate": 7.550488723113506e-06, "loss": 1.3823, "step": 6620 }, { "epoch": 0.3315, "grad_norm": 1.890625, "learning_rate": 7.543716690267145e-06, "loss": 1.3964, "step": 6630 }, { "epoch": 0.332, "grad_norm": 1.8828125, "learning_rate": 7.5369383558717325e-06, "loss": 1.3893, "step": 6640 }, { "epoch": 0.3325, "grad_norm": 2.109375, "learning_rate": 7.53015373671923e-06, "loss": 1.3873, "step": 6650 }, { "epoch": 0.333, "grad_norm": 2.484375, "learning_rate": 7.523362849617177e-06, "loss": 1.3704, "step": 6660 }, { "epoch": 0.3335, "grad_norm": 1.875, "learning_rate": 7.516565711388635e-06, "loss": 1.3764, "step": 6670 }, { "epoch": 0.334, "grad_norm": 1.9375, "learning_rate": 7.509762338872158e-06, "loss": 1.3848, "step": 6680 }, { "epoch": 0.3345, "grad_norm": 1.8515625, "learning_rate": 7.502952748921734e-06, "loss": 1.3859, "step": 6690 }, { "epoch": 0.335, "grad_norm": 2.09375, "learning_rate": 7.4961369584067636e-06, "loss": 1.3715, "step": 6700 }, { "epoch": 0.3355, "grad_norm": 2.015625, "learning_rate": 7.489314984212003e-06, "loss": 1.3778, "step": 6710 }, { "epoch": 0.336, "grad_norm": 2.09375, "learning_rate": 7.482486843237526e-06, "loss": 1.3884, "step": 6720 }, { "epoch": 0.3365, "grad_norm": 1.90625, "learning_rate": 7.475652552398689e-06, "loss": 1.4023, "step": 6730 }, { "epoch": 0.337, "grad_norm": 2.0625, "learning_rate": 7.468812128626079e-06, "loss": 1.3808, "step": 6740 }, { "epoch": 0.3375, "grad_norm": 1.8984375, "learning_rate": 7.461965588865474e-06, "loss": 1.3847, "step": 6750 }, { "epoch": 0.338, "grad_norm": 1.84375, "learning_rate": 7.455112950077807e-06, "loss": 1.3907, "step": 6760 }, { "epoch": 0.3385, "grad_norm": 1.9140625, "learning_rate": 7.448254229239123e-06, "loss": 1.411, "step": 6770 }, { "epoch": 0.339, "grad_norm": 2.015625, "learning_rate": 7.441389443340525e-06, "loss": 1.3975, "step": 6780 }, { "epoch": 0.3395, "grad_norm": 1.875, "learning_rate": 7.434518609388153e-06, "loss": 1.3763, "step": 6790 }, { "epoch": 0.34, "grad_norm": 2.03125, "learning_rate": 7.427641744403115e-06, "loss": 1.4071, "step": 6800 }, { "epoch": 0.3405, "grad_norm": 1.9140625, "learning_rate": 7.420758865421475e-06, "loss": 1.3797, "step": 6810 }, { "epoch": 0.341, "grad_norm": 2.046875, "learning_rate": 7.413869989494183e-06, "loss": 1.3629, "step": 6820 }, { "epoch": 0.3415, "grad_norm": 1.953125, "learning_rate": 7.4069751336870546e-06, "loss": 1.3738, "step": 6830 }, { "epoch": 0.342, "grad_norm": 1.890625, "learning_rate": 7.400074315080711e-06, "loss": 1.3879, "step": 6840 }, { "epoch": 0.3425, "grad_norm": 2.0625, "learning_rate": 7.393167550770554e-06, "loss": 1.3968, "step": 6850 }, { "epoch": 0.343, "grad_norm": 1.953125, "learning_rate": 7.386254857866707e-06, "loss": 1.3894, "step": 6860 }, { "epoch": 0.3435, "grad_norm": 1.9453125, "learning_rate": 7.379336253493984e-06, "loss": 1.3871, "step": 6870 }, { "epoch": 0.344, "grad_norm": 1.890625, "learning_rate": 7.372411754791842e-06, "loss": 1.3801, "step": 6880 }, { "epoch": 0.3445, "grad_norm": 2.078125, "learning_rate": 7.365481378914343e-06, "loss": 1.3842, "step": 6890 }, { "epoch": 0.345, "grad_norm": 2.765625, "learning_rate": 7.3585451430301034e-06, "loss": 1.3629, "step": 6900 }, { "epoch": 0.3455, "grad_norm": 2.09375, "learning_rate": 7.351603064322263e-06, "loss": 1.3829, "step": 6910 }, { "epoch": 0.346, "grad_norm": 1.921875, "learning_rate": 7.34465515998843e-06, "loss": 1.3876, "step": 6920 }, { "epoch": 0.3465, "grad_norm": 2.046875, "learning_rate": 7.337701447240647e-06, "loss": 1.3856, "step": 6930 }, { "epoch": 0.347, "grad_norm": 1.7421875, "learning_rate": 7.330741943305346e-06, "loss": 1.3844, "step": 6940 }, { "epoch": 0.3475, "grad_norm": 2.015625, "learning_rate": 7.323776665423308e-06, "loss": 1.3827, "step": 6950 }, { "epoch": 0.348, "grad_norm": 1.9921875, "learning_rate": 7.316805630849609e-06, "loss": 1.3818, "step": 6960 }, { "epoch": 0.3485, "grad_norm": 1.8984375, "learning_rate": 7.3098288568536e-06, "loss": 1.3632, "step": 6970 }, { "epoch": 0.349, "grad_norm": 1.84375, "learning_rate": 7.3028463607188364e-06, "loss": 1.4126, "step": 6980 }, { "epoch": 0.3495, "grad_norm": 1.9609375, "learning_rate": 7.2958581597430565e-06, "loss": 1.3745, "step": 6990 }, { "epoch": 0.35, "grad_norm": 2.0, "learning_rate": 7.28886427123813e-06, "loss": 1.388, "step": 7000 }, { "epoch": 0.3505, "grad_norm": 2.046875, "learning_rate": 7.281864712530018e-06, "loss": 1.4059, "step": 7010 }, { "epoch": 0.351, "grad_norm": 2.046875, "learning_rate": 7.274859500958722e-06, "loss": 1.381, "step": 7020 }, { "epoch": 0.3515, "grad_norm": 2.0, "learning_rate": 7.267848653878257e-06, "loss": 1.3922, "step": 7030 }, { "epoch": 0.352, "grad_norm": 1.703125, "learning_rate": 7.260832188656591e-06, "loss": 1.3903, "step": 7040 }, { "epoch": 0.3525, "grad_norm": 1.9453125, "learning_rate": 7.25381012267561e-06, "loss": 1.3846, "step": 7050 }, { "epoch": 0.353, "grad_norm": 2.09375, "learning_rate": 7.246782473331082e-06, "loss": 1.3952, "step": 7060 }, { "epoch": 0.3535, "grad_norm": 1.96875, "learning_rate": 7.2397492580326e-06, "loss": 1.3961, "step": 7070 }, { "epoch": 0.354, "grad_norm": 1.9921875, "learning_rate": 7.232710494203544e-06, "loss": 1.3977, "step": 7080 }, { "epoch": 0.3545, "grad_norm": 1.859375, "learning_rate": 7.22566619928105e-06, "loss": 1.3711, "step": 7090 }, { "epoch": 0.355, "grad_norm": 1.859375, "learning_rate": 7.218616390715942e-06, "loss": 1.3951, "step": 7100 }, { "epoch": 0.3555, "grad_norm": 2.28125, "learning_rate": 7.211561085972714e-06, "loss": 1.3845, "step": 7110 }, { "epoch": 0.356, "grad_norm": 1.796875, "learning_rate": 7.20450030252947e-06, "loss": 1.3873, "step": 7120 }, { "epoch": 0.3565, "grad_norm": 1.90625, "learning_rate": 7.1974340578778905e-06, "loss": 1.3852, "step": 7130 }, { "epoch": 0.357, "grad_norm": 2.1875, "learning_rate": 7.190362369523176e-06, "loss": 1.3893, "step": 7140 }, { "epoch": 0.3575, "grad_norm": 1.953125, "learning_rate": 7.183285254984027e-06, "loss": 1.387, "step": 7150 }, { "epoch": 0.358, "grad_norm": 2.765625, "learning_rate": 7.1762027317925745e-06, "loss": 1.3859, "step": 7160 }, { "epoch": 0.3585, "grad_norm": 2.09375, "learning_rate": 7.1691148174943525e-06, "loss": 1.3891, "step": 7170 }, { "epoch": 0.359, "grad_norm": 1.7421875, "learning_rate": 7.162021529648251e-06, "loss": 1.3853, "step": 7180 }, { "epoch": 0.3595, "grad_norm": 2.171875, "learning_rate": 7.154922885826472e-06, "loss": 1.4047, "step": 7190 }, { "epoch": 0.36, "grad_norm": 1.921875, "learning_rate": 7.147818903614482e-06, "loss": 1.3866, "step": 7200 }, { "epoch": 0.3605, "grad_norm": 1.9375, "learning_rate": 7.140709600610979e-06, "loss": 1.389, "step": 7210 }, { "epoch": 0.361, "grad_norm": 2.265625, "learning_rate": 7.133594994427835e-06, "loss": 1.3885, "step": 7220 }, { "epoch": 0.3615, "grad_norm": 2.1875, "learning_rate": 7.126475102690065e-06, "loss": 1.3803, "step": 7230 }, { "epoch": 0.362, "grad_norm": 2.96875, "learning_rate": 7.119349943035776e-06, "loss": 1.3752, "step": 7240 }, { "epoch": 0.3625, "grad_norm": 2.015625, "learning_rate": 7.1122195331161256e-06, "loss": 1.3767, "step": 7250 }, { "epoch": 0.363, "grad_norm": 2.09375, "learning_rate": 7.105083890595276e-06, "loss": 1.3687, "step": 7260 }, { "epoch": 0.3635, "grad_norm": 2.15625, "learning_rate": 7.097943033150354e-06, "loss": 1.3872, "step": 7270 }, { "epoch": 0.364, "grad_norm": 1.8515625, "learning_rate": 7.090796978471406e-06, "loss": 1.3776, "step": 7280 }, { "epoch": 0.3645, "grad_norm": 2.8125, "learning_rate": 7.083645744261351e-06, "loss": 1.3991, "step": 7290 }, { "epoch": 0.365, "grad_norm": 1.890625, "learning_rate": 7.076489348235941e-06, "loss": 1.3878, "step": 7300 }, { "epoch": 0.3655, "grad_norm": 1.9453125, "learning_rate": 7.069327808123716e-06, "loss": 1.3807, "step": 7310 }, { "epoch": 0.366, "grad_norm": 2.0, "learning_rate": 7.062161141665957e-06, "loss": 1.3991, "step": 7320 }, { "epoch": 0.3665, "grad_norm": 2.078125, "learning_rate": 7.054989366616647e-06, "loss": 1.3865, "step": 7330 }, { "epoch": 0.367, "grad_norm": 1.96875, "learning_rate": 7.047812500742425e-06, "loss": 1.3896, "step": 7340 }, { "epoch": 0.3675, "grad_norm": 1.8984375, "learning_rate": 7.040630561822534e-06, "loss": 1.3849, "step": 7350 }, { "epoch": 0.368, "grad_norm": 1.9375, "learning_rate": 7.033443567648799e-06, "loss": 1.3782, "step": 7360 }, { "epoch": 0.3685, "grad_norm": 2.09375, "learning_rate": 7.026251536025552e-06, "loss": 1.4036, "step": 7370 }, { "epoch": 0.369, "grad_norm": 2.109375, "learning_rate": 7.0190544847696174e-06, "loss": 1.3993, "step": 7380 }, { "epoch": 0.3695, "grad_norm": 1.84375, "learning_rate": 7.011852431710247e-06, "loss": 1.3732, "step": 7390 }, { "epoch": 0.37, "grad_norm": 2.0625, "learning_rate": 7.004645394689087e-06, "loss": 1.4014, "step": 7400 }, { "epoch": 0.3705, "grad_norm": 2.109375, "learning_rate": 6.997433391560127e-06, "loss": 1.3707, "step": 7410 }, { "epoch": 0.371, "grad_norm": 1.8984375, "learning_rate": 6.990216440189664e-06, "loss": 1.3907, "step": 7420 }, { "epoch": 0.3715, "grad_norm": 2.109375, "learning_rate": 6.982994558456251e-06, "loss": 1.387, "step": 7430 }, { "epoch": 0.372, "grad_norm": 2.03125, "learning_rate": 6.975767764250654e-06, "loss": 1.3887, "step": 7440 }, { "epoch": 0.3725, "grad_norm": 1.703125, "learning_rate": 6.968536075475808e-06, "loss": 1.3776, "step": 7450 }, { "epoch": 0.373, "grad_norm": 2.109375, "learning_rate": 6.961299510046777e-06, "loss": 1.3829, "step": 7460 }, { "epoch": 0.3735, "grad_norm": 1.765625, "learning_rate": 6.954058085890703e-06, "loss": 1.3887, "step": 7470 }, { "epoch": 0.374, "grad_norm": 1.8515625, "learning_rate": 6.946811820946764e-06, "loss": 1.3897, "step": 7480 }, { "epoch": 0.3745, "grad_norm": 1.921875, "learning_rate": 6.939560733166133e-06, "loss": 1.3901, "step": 7490 }, { "epoch": 0.375, "grad_norm": 2.0, "learning_rate": 6.9323048405119275e-06, "loss": 1.4046, "step": 7500 }, { "epoch": 0.3755, "grad_norm": 2.515625, "learning_rate": 6.92504416095917e-06, "loss": 1.3898, "step": 7510 }, { "epoch": 0.376, "grad_norm": 2.0, "learning_rate": 6.9177787124947426e-06, "loss": 1.3951, "step": 7520 }, { "epoch": 0.3765, "grad_norm": 1.921875, "learning_rate": 6.910508513117338e-06, "loss": 1.3785, "step": 7530 }, { "epoch": 0.377, "grad_norm": 2.296875, "learning_rate": 6.90323358083742e-06, "loss": 1.3946, "step": 7540 }, { "epoch": 0.3775, "grad_norm": 1.90625, "learning_rate": 6.895953933677178e-06, "loss": 1.3803, "step": 7550 }, { "epoch": 0.378, "grad_norm": 2.09375, "learning_rate": 6.888669589670481e-06, "loss": 1.3884, "step": 7560 }, { "epoch": 0.3785, "grad_norm": 3.265625, "learning_rate": 6.881380566862835e-06, "loss": 1.3821, "step": 7570 }, { "epoch": 0.379, "grad_norm": 2.0625, "learning_rate": 6.874086883311335e-06, "loss": 1.3942, "step": 7580 }, { "epoch": 0.3795, "grad_norm": 1.84375, "learning_rate": 6.86678855708462e-06, "loss": 1.4022, "step": 7590 }, { "epoch": 0.38, "grad_norm": 1.8359375, "learning_rate": 6.859485606262834e-06, "loss": 1.4067, "step": 7600 }, { "epoch": 0.3805, "grad_norm": 1.9375, "learning_rate": 6.852178048937578e-06, "loss": 1.3751, "step": 7610 }, { "epoch": 0.381, "grad_norm": 1.890625, "learning_rate": 6.844865903211859e-06, "loss": 1.3976, "step": 7620 }, { "epoch": 0.3815, "grad_norm": 1.765625, "learning_rate": 6.837549187200059e-06, "loss": 1.3866, "step": 7630 }, { "epoch": 0.382, "grad_norm": 2.140625, "learning_rate": 6.830227919027876e-06, "loss": 1.3966, "step": 7640 }, { "epoch": 0.3825, "grad_norm": 1.828125, "learning_rate": 6.822902116832286e-06, "loss": 1.3924, "step": 7650 }, { "epoch": 0.383, "grad_norm": 2.359375, "learning_rate": 6.815571798761499e-06, "loss": 1.3959, "step": 7660 }, { "epoch": 0.3835, "grad_norm": 1.9140625, "learning_rate": 6.8082369829749095e-06, "loss": 1.3809, "step": 7670 }, { "epoch": 0.384, "grad_norm": 1.9453125, "learning_rate": 6.800897687643057e-06, "loss": 1.3964, "step": 7680 }, { "epoch": 0.3845, "grad_norm": 1.90625, "learning_rate": 6.7935539309475775e-06, "loss": 1.3934, "step": 7690 }, { "epoch": 0.385, "grad_norm": 1.9921875, "learning_rate": 6.786205731081158e-06, "loss": 1.3984, "step": 7700 }, { "epoch": 0.3855, "grad_norm": 1.8984375, "learning_rate": 6.778853106247492e-06, "loss": 1.3802, "step": 7710 }, { "epoch": 0.386, "grad_norm": 1.890625, "learning_rate": 6.771496074661239e-06, "loss": 1.3885, "step": 7720 }, { "epoch": 0.3865, "grad_norm": 1.75, "learning_rate": 6.764134654547969e-06, "loss": 1.3807, "step": 7730 }, { "epoch": 0.387, "grad_norm": 1.8515625, "learning_rate": 6.756768864144129e-06, "loss": 1.3857, "step": 7740 }, { "epoch": 0.3875, "grad_norm": 1.8203125, "learning_rate": 6.74939872169699e-06, "loss": 1.381, "step": 7750 }, { "epoch": 0.388, "grad_norm": 2.03125, "learning_rate": 6.7420242454646075e-06, "loss": 1.3989, "step": 7760 }, { "epoch": 0.3885, "grad_norm": 1.9375, "learning_rate": 6.734645453715768e-06, "loss": 1.3792, "step": 7770 }, { "epoch": 0.389, "grad_norm": 2.140625, "learning_rate": 6.72726236472995e-06, "loss": 1.41, "step": 7780 }, { "epoch": 0.3895, "grad_norm": 3.03125, "learning_rate": 6.7198749967972835e-06, "loss": 1.3844, "step": 7790 }, { "epoch": 0.39, "grad_norm": 1.9296875, "learning_rate": 6.712483368218492e-06, "loss": 1.3874, "step": 7800 }, { "epoch": 0.3905, "grad_norm": 1.9921875, "learning_rate": 6.705087497304853e-06, "loss": 1.3793, "step": 7810 }, { "epoch": 0.391, "grad_norm": 2.96875, "learning_rate": 6.697687402378161e-06, "loss": 1.3955, "step": 7820 }, { "epoch": 0.3915, "grad_norm": 2.171875, "learning_rate": 6.690283101770669e-06, "loss": 1.3737, "step": 7830 }, { "epoch": 0.392, "grad_norm": 1.9453125, "learning_rate": 6.68287461382505e-06, "loss": 1.3624, "step": 7840 }, { "epoch": 0.3925, "grad_norm": 2.078125, "learning_rate": 6.675461956894349e-06, "loss": 1.3818, "step": 7850 }, { "epoch": 0.393, "grad_norm": 1.9296875, "learning_rate": 6.668045149341943e-06, "loss": 1.3715, "step": 7860 }, { "epoch": 0.3935, "grad_norm": 1.984375, "learning_rate": 6.6606242095414856e-06, "loss": 1.374, "step": 7870 }, { "epoch": 0.394, "grad_norm": 1.703125, "learning_rate": 6.65319915587687e-06, "loss": 1.3798, "step": 7880 }, { "epoch": 0.3945, "grad_norm": 1.9609375, "learning_rate": 6.645770006742183e-06, "loss": 1.3887, "step": 7890 }, { "epoch": 0.395, "grad_norm": 1.6875, "learning_rate": 6.6383367805416535e-06, "loss": 1.3788, "step": 7900 }, { "epoch": 0.3955, "grad_norm": 1.9609375, "learning_rate": 6.6308994956896134e-06, "loss": 1.3849, "step": 7910 }, { "epoch": 0.396, "grad_norm": 1.8359375, "learning_rate": 6.623458170610445e-06, "loss": 1.3925, "step": 7920 }, { "epoch": 0.3965, "grad_norm": 1.921875, "learning_rate": 6.6160128237385436e-06, "loss": 1.3969, "step": 7930 }, { "epoch": 0.397, "grad_norm": 2.03125, "learning_rate": 6.608563473518266e-06, "loss": 1.371, "step": 7940 }, { "epoch": 0.3975, "grad_norm": 2.0, "learning_rate": 6.601110138403886e-06, "loss": 1.3887, "step": 7950 }, { "epoch": 0.398, "grad_norm": 2.140625, "learning_rate": 6.59365283685955e-06, "loss": 1.3802, "step": 7960 }, { "epoch": 0.3985, "grad_norm": 1.8984375, "learning_rate": 6.5861915873592316e-06, "loss": 1.393, "step": 7970 }, { "epoch": 0.399, "grad_norm": 1.9609375, "learning_rate": 6.578726408386682e-06, "loss": 1.3802, "step": 7980 }, { "epoch": 0.3995, "grad_norm": 2.15625, "learning_rate": 6.571257318435389e-06, "loss": 1.3822, "step": 7990 }, { "epoch": 0.4, "grad_norm": 1.8671875, "learning_rate": 6.563784336008528e-06, "loss": 1.399, "step": 8000 }, { "epoch": 0.4005, "grad_norm": 1.8828125, "learning_rate": 6.556307479618915e-06, "loss": 1.3814, "step": 8010 }, { "epoch": 0.401, "grad_norm": 1.859375, "learning_rate": 6.548826767788968e-06, "loss": 1.3771, "step": 8020 }, { "epoch": 0.4015, "grad_norm": 1.8828125, "learning_rate": 6.541342219050655e-06, "loss": 1.3803, "step": 8030 }, { "epoch": 0.402, "grad_norm": 1.859375, "learning_rate": 6.5338538519454444e-06, "loss": 1.377, "step": 8040 }, { "epoch": 0.4025, "grad_norm": 1.8359375, "learning_rate": 6.52636168502427e-06, "loss": 1.3917, "step": 8050 }, { "epoch": 0.403, "grad_norm": 2.3125, "learning_rate": 6.5188657368474715e-06, "loss": 1.3842, "step": 8060 }, { "epoch": 0.4035, "grad_norm": 1.8671875, "learning_rate": 6.511366025984766e-06, "loss": 1.3853, "step": 8070 }, { "epoch": 0.404, "grad_norm": 2.140625, "learning_rate": 6.503862571015183e-06, "loss": 1.3917, "step": 8080 }, { "epoch": 0.4045, "grad_norm": 1.75, "learning_rate": 6.496355390527033e-06, "loss": 1.3886, "step": 8090 }, { "epoch": 0.405, "grad_norm": 1.703125, "learning_rate": 6.48884450311785e-06, "loss": 1.3888, "step": 8100 }, { "epoch": 0.4055, "grad_norm": 2.015625, "learning_rate": 6.4813299273943564e-06, "loss": 1.4094, "step": 8110 }, { "epoch": 0.406, "grad_norm": 2.09375, "learning_rate": 6.473811681972411e-06, "loss": 1.3892, "step": 8120 }, { "epoch": 0.4065, "grad_norm": 2.09375, "learning_rate": 6.46628978547696e-06, "loss": 1.403, "step": 8130 }, { "epoch": 0.407, "grad_norm": 2.296875, "learning_rate": 6.458764256541996e-06, "loss": 1.3784, "step": 8140 }, { "epoch": 0.4075, "grad_norm": 2.015625, "learning_rate": 6.451235113810515e-06, "loss": 1.393, "step": 8150 }, { "epoch": 0.408, "grad_norm": 2.015625, "learning_rate": 6.443702375934457e-06, "loss": 1.4011, "step": 8160 }, { "epoch": 0.4085, "grad_norm": 1.9140625, "learning_rate": 6.436166061574675e-06, "loss": 1.3882, "step": 8170 }, { "epoch": 0.409, "grad_norm": 1.984375, "learning_rate": 6.428626189400881e-06, "loss": 1.3989, "step": 8180 }, { "epoch": 0.4095, "grad_norm": 1.8515625, "learning_rate": 6.4210827780915975e-06, "loss": 1.4008, "step": 8190 }, { "epoch": 0.41, "grad_norm": 3.484375, "learning_rate": 6.413535846334113e-06, "loss": 1.3906, "step": 8200 }, { "epoch": 0.4105, "grad_norm": 2.15625, "learning_rate": 6.4059854128244495e-06, "loss": 1.3901, "step": 8210 }, { "epoch": 0.411, "grad_norm": 1.75, "learning_rate": 6.398431496267287e-06, "loss": 1.3977, "step": 8220 }, { "epoch": 0.4115, "grad_norm": 1.8671875, "learning_rate": 6.390874115375945e-06, "loss": 1.3884, "step": 8230 }, { "epoch": 0.412, "grad_norm": 1.921875, "learning_rate": 6.383313288872321e-06, "loss": 1.3913, "step": 8240 }, { "epoch": 0.4125, "grad_norm": 2.3125, "learning_rate": 6.375749035486853e-06, "loss": 1.3804, "step": 8250 }, { "epoch": 0.413, "grad_norm": 1.8671875, "learning_rate": 6.3681813739584605e-06, "loss": 1.3897, "step": 8260 }, { "epoch": 0.4135, "grad_norm": 1.9140625, "learning_rate": 6.360610323034515e-06, "loss": 1.3808, "step": 8270 }, { "epoch": 0.414, "grad_norm": 2.21875, "learning_rate": 6.353035901470778e-06, "loss": 1.3858, "step": 8280 }, { "epoch": 0.4145, "grad_norm": 2.140625, "learning_rate": 6.345458128031361e-06, "loss": 1.3841, "step": 8290 }, { "epoch": 0.415, "grad_norm": 1.7734375, "learning_rate": 6.337877021488684e-06, "loss": 1.3793, "step": 8300 }, { "epoch": 0.4155, "grad_norm": 1.890625, "learning_rate": 6.330292600623423e-06, "loss": 1.3814, "step": 8310 }, { "epoch": 0.416, "grad_norm": 2.1875, "learning_rate": 6.322704884224455e-06, "loss": 1.3902, "step": 8320 }, { "epoch": 0.4165, "grad_norm": 1.8046875, "learning_rate": 6.31511389108884e-06, "loss": 1.3813, "step": 8330 }, { "epoch": 0.417, "grad_norm": 1.796875, "learning_rate": 6.307519640021735e-06, "loss": 1.3904, "step": 8340 }, { "epoch": 0.4175, "grad_norm": 1.7265625, "learning_rate": 6.299922149836383e-06, "loss": 1.389, "step": 8350 }, { "epoch": 0.418, "grad_norm": 2.015625, "learning_rate": 6.292321439354043e-06, "loss": 1.383, "step": 8360 }, { "epoch": 0.4185, "grad_norm": 1.890625, "learning_rate": 6.284717527403956e-06, "loss": 1.3869, "step": 8370 }, { "epoch": 0.419, "grad_norm": 2.03125, "learning_rate": 6.277110432823289e-06, "loss": 1.3779, "step": 8380 }, { "epoch": 0.4195, "grad_norm": 2.03125, "learning_rate": 6.2695001744571e-06, "loss": 1.3781, "step": 8390 }, { "epoch": 0.42, "grad_norm": 2.09375, "learning_rate": 6.261886771158279e-06, "loss": 1.3973, "step": 8400 }, { "epoch": 0.4205, "grad_norm": 1.828125, "learning_rate": 6.25427024178751e-06, "loss": 1.3997, "step": 8410 }, { "epoch": 0.421, "grad_norm": 2.28125, "learning_rate": 6.24665060521322e-06, "loss": 1.3862, "step": 8420 }, { "epoch": 0.4215, "grad_norm": 1.7890625, "learning_rate": 6.239027880311533e-06, "loss": 1.3756, "step": 8430 }, { "epoch": 0.422, "grad_norm": 1.8125, "learning_rate": 6.231402085966226e-06, "loss": 1.4024, "step": 8440 }, { "epoch": 0.4225, "grad_norm": 1.859375, "learning_rate": 6.223773241068678e-06, "loss": 1.3754, "step": 8450 }, { "epoch": 0.423, "grad_norm": 1.9453125, "learning_rate": 6.2161413645178245e-06, "loss": 1.3774, "step": 8460 }, { "epoch": 0.4235, "grad_norm": 1.796875, "learning_rate": 6.208506475220113e-06, "loss": 1.379, "step": 8470 }, { "epoch": 0.424, "grad_norm": 1.8515625, "learning_rate": 6.200868592089455e-06, "loss": 1.3839, "step": 8480 }, { "epoch": 0.4245, "grad_norm": 1.9609375, "learning_rate": 6.193227734047176e-06, "loss": 1.391, "step": 8490 }, { "epoch": 0.425, "grad_norm": 1.8125, "learning_rate": 6.185583920021973e-06, "loss": 1.3811, "step": 8500 }, { "epoch": 0.4255, "grad_norm": 1.875, "learning_rate": 6.177937168949864e-06, "loss": 1.3777, "step": 8510 }, { "epoch": 0.426, "grad_norm": 2.078125, "learning_rate": 6.1702874997741505e-06, "loss": 1.3931, "step": 8520 }, { "epoch": 0.4265, "grad_norm": 1.8671875, "learning_rate": 6.162634931445349e-06, "loss": 1.3905, "step": 8530 }, { "epoch": 0.427, "grad_norm": 1.8671875, "learning_rate": 6.1549794829211715e-06, "loss": 1.3709, "step": 8540 }, { "epoch": 0.4275, "grad_norm": 1.6953125, "learning_rate": 6.147321173166457e-06, "loss": 1.3903, "step": 8550 }, { "epoch": 0.428, "grad_norm": 1.9140625, "learning_rate": 6.139660021153138e-06, "loss": 1.3849, "step": 8560 }, { "epoch": 0.4285, "grad_norm": 2.078125, "learning_rate": 6.131996045860182e-06, "loss": 1.4017, "step": 8570 }, { "epoch": 0.429, "grad_norm": 1.921875, "learning_rate": 6.124329266273559e-06, "loss": 1.3851, "step": 8580 }, { "epoch": 0.4295, "grad_norm": 2.234375, "learning_rate": 6.116659701386175e-06, "loss": 1.4021, "step": 8590 }, { "epoch": 0.43, "grad_norm": 1.828125, "learning_rate": 6.108987370197848e-06, "loss": 1.4035, "step": 8600 }, { "epoch": 0.4305, "grad_norm": 2.453125, "learning_rate": 6.10131229171524e-06, "loss": 1.3905, "step": 8610 }, { "epoch": 0.431, "grad_norm": 2.03125, "learning_rate": 6.093634484951822e-06, "loss": 1.37, "step": 8620 }, { "epoch": 0.4315, "grad_norm": 2.046875, "learning_rate": 6.0859539689278235e-06, "loss": 1.3915, "step": 8630 }, { "epoch": 0.432, "grad_norm": 1.8203125, "learning_rate": 6.078270762670189e-06, "loss": 1.3765, "step": 8640 }, { "epoch": 0.4325, "grad_norm": 1.9296875, "learning_rate": 6.070584885212519e-06, "loss": 1.3842, "step": 8650 }, { "epoch": 0.433, "grad_norm": 2.046875, "learning_rate": 6.062896355595043e-06, "loss": 1.3981, "step": 8660 }, { "epoch": 0.4335, "grad_norm": 1.78125, "learning_rate": 6.055205192864551e-06, "loss": 1.3753, "step": 8670 }, { "epoch": 0.434, "grad_norm": 1.9140625, "learning_rate": 6.0475114160743585e-06, "loss": 1.3746, "step": 8680 }, { "epoch": 0.4345, "grad_norm": 1.859375, "learning_rate": 6.039815044284259e-06, "loss": 1.3975, "step": 8690 }, { "epoch": 0.435, "grad_norm": 1.8125, "learning_rate": 6.032116096560476e-06, "loss": 1.3777, "step": 8700 }, { "epoch": 0.4355, "grad_norm": 1.8671875, "learning_rate": 6.0244145919756095e-06, "loss": 1.3886, "step": 8710 }, { "epoch": 0.436, "grad_norm": 1.9765625, "learning_rate": 6.016710549608595e-06, "loss": 1.387, "step": 8720 }, { "epoch": 0.4365, "grad_norm": 1.875, "learning_rate": 6.009003988544656e-06, "loss": 1.3932, "step": 8730 }, { "epoch": 0.437, "grad_norm": 1.9375, "learning_rate": 6.001294927875257e-06, "loss": 1.3699, "step": 8740 }, { "epoch": 0.4375, "grad_norm": 1.8515625, "learning_rate": 5.993583386698052e-06, "loss": 1.3888, "step": 8750 }, { "epoch": 0.438, "grad_norm": 1.890625, "learning_rate": 5.985869384116843e-06, "loss": 1.3794, "step": 8760 }, { "epoch": 0.4385, "grad_norm": 1.765625, "learning_rate": 5.978152939241522e-06, "loss": 1.3922, "step": 8770 }, { "epoch": 0.439, "grad_norm": 2.265625, "learning_rate": 5.970434071188044e-06, "loss": 1.3856, "step": 8780 }, { "epoch": 0.4395, "grad_norm": 2.828125, "learning_rate": 5.962712799078355e-06, "loss": 1.3751, "step": 8790 }, { "epoch": 0.44, "grad_norm": 1.9375, "learning_rate": 5.954989142040364e-06, "loss": 1.3855, "step": 8800 }, { "epoch": 0.4405, "grad_norm": 1.75, "learning_rate": 5.9472631192078834e-06, "loss": 1.3857, "step": 8810 }, { "epoch": 0.441, "grad_norm": 2.1875, "learning_rate": 5.939534749720592e-06, "loss": 1.3918, "step": 8820 }, { "epoch": 0.4415, "grad_norm": 2.140625, "learning_rate": 5.931804052723976e-06, "loss": 1.3842, "step": 8830 }, { "epoch": 0.442, "grad_norm": 1.875, "learning_rate": 5.9240710473692905e-06, "loss": 1.376, "step": 8840 }, { "epoch": 0.4425, "grad_norm": 1.859375, "learning_rate": 5.916335752813509e-06, "loss": 1.3919, "step": 8850 }, { "epoch": 0.443, "grad_norm": 2.078125, "learning_rate": 5.908598188219277e-06, "loss": 1.3792, "step": 8860 }, { "epoch": 0.4435, "grad_norm": 1.984375, "learning_rate": 5.9008583727548596e-06, "loss": 1.4011, "step": 8870 }, { "epoch": 0.444, "grad_norm": 1.9296875, "learning_rate": 5.893116325594105e-06, "loss": 1.3738, "step": 8880 }, { "epoch": 0.4445, "grad_norm": 1.984375, "learning_rate": 5.885372065916381e-06, "loss": 1.3786, "step": 8890 }, { "epoch": 0.445, "grad_norm": 2.3125, "learning_rate": 5.877625612906543e-06, "loss": 1.3756, "step": 8900 }, { "epoch": 0.4455, "grad_norm": 1.9921875, "learning_rate": 5.8698769857548785e-06, "loss": 1.382, "step": 8910 }, { "epoch": 0.446, "grad_norm": 1.7734375, "learning_rate": 5.862126203657062e-06, "loss": 1.3805, "step": 8920 }, { "epoch": 0.4465, "grad_norm": 1.96875, "learning_rate": 5.854373285814103e-06, "loss": 1.3904, "step": 8930 }, { "epoch": 0.447, "grad_norm": 1.6953125, "learning_rate": 5.846618251432305e-06, "loss": 1.3624, "step": 8940 }, { "epoch": 0.4475, "grad_norm": 1.9296875, "learning_rate": 5.838861119723211e-06, "loss": 1.3834, "step": 8950 }, { "epoch": 0.448, "grad_norm": 1.796875, "learning_rate": 5.831101909903567e-06, "loss": 1.3821, "step": 8960 }, { "epoch": 0.4485, "grad_norm": 1.9765625, "learning_rate": 5.82334064119526e-06, "loss": 1.3983, "step": 8970 }, { "epoch": 0.449, "grad_norm": 1.96875, "learning_rate": 5.81557733282528e-06, "loss": 1.368, "step": 8980 }, { "epoch": 0.4495, "grad_norm": 2.015625, "learning_rate": 5.807812004025671e-06, "loss": 1.3844, "step": 8990 }, { "epoch": 0.45, "grad_norm": 1.7109375, "learning_rate": 5.8000446740334786e-06, "loss": 1.3862, "step": 9000 }, { "epoch": 0.4505, "grad_norm": 1.8515625, "learning_rate": 5.79227536209071e-06, "loss": 1.3711, "step": 9010 }, { "epoch": 0.451, "grad_norm": 2.0, "learning_rate": 5.784504087444283e-06, "loss": 1.3756, "step": 9020 }, { "epoch": 0.4515, "grad_norm": 2.078125, "learning_rate": 5.776730869345973e-06, "loss": 1.3794, "step": 9030 }, { "epoch": 0.452, "grad_norm": 1.921875, "learning_rate": 5.768955727052372e-06, "loss": 1.3729, "step": 9040 }, { "epoch": 0.4525, "grad_norm": 2.03125, "learning_rate": 5.761178679824839e-06, "loss": 1.4037, "step": 9050 }, { "epoch": 0.453, "grad_norm": 1.953125, "learning_rate": 5.753399746929455e-06, "loss": 1.4116, "step": 9060 }, { "epoch": 0.4535, "grad_norm": 1.625, "learning_rate": 5.745618947636965e-06, "loss": 1.3768, "step": 9070 }, { "epoch": 0.454, "grad_norm": 1.9296875, "learning_rate": 5.737836301222746e-06, "loss": 1.3798, "step": 9080 }, { "epoch": 0.4545, "grad_norm": 2.015625, "learning_rate": 5.730051826966746e-06, "loss": 1.3942, "step": 9090 }, { "epoch": 0.455, "grad_norm": 2.234375, "learning_rate": 5.7222655441534415e-06, "loss": 1.4141, "step": 9100 }, { "epoch": 0.4555, "grad_norm": 2.0, "learning_rate": 5.714477472071789e-06, "loss": 1.3854, "step": 9110 }, { "epoch": 0.456, "grad_norm": 2.015625, "learning_rate": 5.706687630015181e-06, "loss": 1.3893, "step": 9120 }, { "epoch": 0.4565, "grad_norm": 1.8515625, "learning_rate": 5.698896037281391e-06, "loss": 1.3977, "step": 9130 }, { "epoch": 0.457, "grad_norm": 1.9375, "learning_rate": 5.691102713172529e-06, "loss": 1.4048, "step": 9140 }, { "epoch": 0.4575, "grad_norm": 1.859375, "learning_rate": 5.683307676995001e-06, "loss": 1.3759, "step": 9150 }, { "epoch": 0.458, "grad_norm": 2.0625, "learning_rate": 5.6755109480594435e-06, "loss": 1.3909, "step": 9160 }, { "epoch": 0.4585, "grad_norm": 2.109375, "learning_rate": 5.667712545680696e-06, "loss": 1.3792, "step": 9170 }, { "epoch": 0.459, "grad_norm": 1.890625, "learning_rate": 5.659912489177738e-06, "loss": 1.3992, "step": 9180 }, { "epoch": 0.4595, "grad_norm": 1.84375, "learning_rate": 5.652110797873649e-06, "loss": 1.3895, "step": 9190 }, { "epoch": 0.46, "grad_norm": 1.9921875, "learning_rate": 5.644307491095557e-06, "loss": 1.3972, "step": 9200 }, { "epoch": 0.4605, "grad_norm": 1.9140625, "learning_rate": 5.636502588174595e-06, "loss": 1.3896, "step": 9210 }, { "epoch": 0.461, "grad_norm": 1.875, "learning_rate": 5.628696108445847e-06, "loss": 1.3643, "step": 9220 }, { "epoch": 0.4615, "grad_norm": 2.125, "learning_rate": 5.620888071248303e-06, "loss": 1.3968, "step": 9230 }, { "epoch": 0.462, "grad_norm": 1.8515625, "learning_rate": 5.613078495924814e-06, "loss": 1.3727, "step": 9240 }, { "epoch": 0.4625, "grad_norm": 2.421875, "learning_rate": 5.605267401822041e-06, "loss": 1.3928, "step": 9250 }, { "epoch": 0.463, "grad_norm": 2.015625, "learning_rate": 5.597454808290403e-06, "loss": 1.385, "step": 9260 }, { "epoch": 0.4635, "grad_norm": 1.8984375, "learning_rate": 5.589640734684043e-06, "loss": 1.3987, "step": 9270 }, { "epoch": 0.464, "grad_norm": 1.875, "learning_rate": 5.5818252003607585e-06, "loss": 1.3853, "step": 9280 }, { "epoch": 0.4645, "grad_norm": 2.0, "learning_rate": 5.574008224681975e-06, "loss": 1.388, "step": 9290 }, { "epoch": 0.465, "grad_norm": 1.9609375, "learning_rate": 5.566189827012685e-06, "loss": 1.396, "step": 9300 }, { "epoch": 0.4655, "grad_norm": 1.9765625, "learning_rate": 5.558370026721404e-06, "loss": 1.3702, "step": 9310 }, { "epoch": 0.466, "grad_norm": 1.96875, "learning_rate": 5.550548843180121e-06, "loss": 1.3861, "step": 9320 }, { "epoch": 0.4665, "grad_norm": 2.171875, "learning_rate": 5.542726295764255e-06, "loss": 1.3713, "step": 9330 }, { "epoch": 0.467, "grad_norm": 2.0, "learning_rate": 5.5349024038526e-06, "loss": 1.3922, "step": 9340 }, { "epoch": 0.4675, "grad_norm": 1.9375, "learning_rate": 5.5270771868272836e-06, "loss": 1.3899, "step": 9350 }, { "epoch": 0.468, "grad_norm": 1.90625, "learning_rate": 5.519250664073715e-06, "loss": 1.393, "step": 9360 }, { "epoch": 0.4685, "grad_norm": 2.078125, "learning_rate": 5.511422854980539e-06, "loss": 1.3866, "step": 9370 }, { "epoch": 0.469, "grad_norm": 1.9453125, "learning_rate": 5.503593778939583e-06, "loss": 1.3669, "step": 9380 }, { "epoch": 0.4695, "grad_norm": 1.9609375, "learning_rate": 5.495763455345821e-06, "loss": 1.3809, "step": 9390 }, { "epoch": 0.47, "grad_norm": 2.0, "learning_rate": 5.487931903597309e-06, "loss": 1.3964, "step": 9400 }, { "epoch": 0.4705, "grad_norm": 1.9375, "learning_rate": 5.480099143095149e-06, "loss": 1.3723, "step": 9410 }, { "epoch": 0.471, "grad_norm": 1.78125, "learning_rate": 5.472265193243439e-06, "loss": 1.3906, "step": 9420 }, { "epoch": 0.4715, "grad_norm": 1.9921875, "learning_rate": 5.464430073449224e-06, "loss": 1.387, "step": 9430 }, { "epoch": 0.472, "grad_norm": 2.125, "learning_rate": 5.456593803122441e-06, "loss": 1.3723, "step": 9440 }, { "epoch": 0.4725, "grad_norm": 2.171875, "learning_rate": 5.448756401675884e-06, "loss": 1.3885, "step": 9450 }, { "epoch": 0.473, "grad_norm": 1.9140625, "learning_rate": 5.440917888525146e-06, "loss": 1.3823, "step": 9460 }, { "epoch": 0.4735, "grad_norm": 1.8125, "learning_rate": 5.4330782830885774e-06, "loss": 1.396, "step": 9470 }, { "epoch": 0.474, "grad_norm": 2.0625, "learning_rate": 5.425237604787228e-06, "loss": 1.3677, "step": 9480 }, { "epoch": 0.4745, "grad_norm": 2.125, "learning_rate": 5.417395873044812e-06, "loss": 1.3871, "step": 9490 }, { "epoch": 0.475, "grad_norm": 1.8984375, "learning_rate": 5.4095531072876466e-06, "loss": 1.3961, "step": 9500 }, { "epoch": 0.4755, "grad_norm": 1.875, "learning_rate": 5.40170932694462e-06, "loss": 1.3715, "step": 9510 }, { "epoch": 0.476, "grad_norm": 2.0, "learning_rate": 5.393864551447123e-06, "loss": 1.3881, "step": 9520 }, { "epoch": 0.4765, "grad_norm": 1.8671875, "learning_rate": 5.386018800229022e-06, "loss": 1.3842, "step": 9530 }, { "epoch": 0.477, "grad_norm": 1.953125, "learning_rate": 5.378172092726592e-06, "loss": 1.3973, "step": 9540 }, { "epoch": 0.4775, "grad_norm": 1.9765625, "learning_rate": 5.370324448378481e-06, "loss": 1.4023, "step": 9550 }, { "epoch": 0.478, "grad_norm": 1.9765625, "learning_rate": 5.362475886625657e-06, "loss": 1.3922, "step": 9560 }, { "epoch": 0.4785, "grad_norm": 2.078125, "learning_rate": 5.3546264269113646e-06, "loss": 1.3778, "step": 9570 }, { "epoch": 0.479, "grad_norm": 2.0, "learning_rate": 5.346776088681067e-06, "loss": 1.3676, "step": 9580 }, { "epoch": 0.4795, "grad_norm": 1.7421875, "learning_rate": 5.338924891382404e-06, "loss": 1.3832, "step": 9590 }, { "epoch": 0.48, "grad_norm": 1.8359375, "learning_rate": 5.3310728544651495e-06, "loss": 1.3756, "step": 9600 }, { "epoch": 0.4805, "grad_norm": 1.984375, "learning_rate": 5.32321999738115e-06, "loss": 1.4089, "step": 9610 }, { "epoch": 0.481, "grad_norm": 2.125, "learning_rate": 5.315366339584292e-06, "loss": 1.3943, "step": 9620 }, { "epoch": 0.4815, "grad_norm": 1.8984375, "learning_rate": 5.307511900530436e-06, "loss": 1.3907, "step": 9630 }, { "epoch": 0.482, "grad_norm": 1.890625, "learning_rate": 5.299656699677387e-06, "loss": 1.401, "step": 9640 }, { "epoch": 0.4825, "grad_norm": 1.9140625, "learning_rate": 5.2918007564848295e-06, "loss": 1.4003, "step": 9650 }, { "epoch": 0.483, "grad_norm": 1.90625, "learning_rate": 5.283944090414295e-06, "loss": 1.3922, "step": 9660 }, { "epoch": 0.4835, "grad_norm": 1.9453125, "learning_rate": 5.276086720929098e-06, "loss": 1.3984, "step": 9670 }, { "epoch": 0.484, "grad_norm": 1.8828125, "learning_rate": 5.268228667494299e-06, "loss": 1.3962, "step": 9680 }, { "epoch": 0.4845, "grad_norm": 1.90625, "learning_rate": 5.260369949576656e-06, "loss": 1.3892, "step": 9690 }, { "epoch": 0.485, "grad_norm": 1.9296875, "learning_rate": 5.252510586644567e-06, "loss": 1.3894, "step": 9700 }, { "epoch": 0.4855, "grad_norm": 2.125, "learning_rate": 5.244650598168031e-06, "loss": 1.3893, "step": 9710 }, { "epoch": 0.486, "grad_norm": 1.921875, "learning_rate": 5.236790003618598e-06, "loss": 1.3807, "step": 9720 }, { "epoch": 0.4865, "grad_norm": 2.078125, "learning_rate": 5.228928822469317e-06, "loss": 1.3866, "step": 9730 }, { "epoch": 0.487, "grad_norm": 1.859375, "learning_rate": 5.22106707419469e-06, "loss": 1.3884, "step": 9740 }, { "epoch": 0.4875, "grad_norm": 2.0, "learning_rate": 5.2132047782706275e-06, "loss": 1.3975, "step": 9750 }, { "epoch": 0.488, "grad_norm": 1.875, "learning_rate": 5.205341954174394e-06, "loss": 1.3833, "step": 9760 }, { "epoch": 0.4885, "grad_norm": 1.9921875, "learning_rate": 5.19747862138456e-06, "loss": 1.4089, "step": 9770 }, { "epoch": 0.489, "grad_norm": 1.9140625, "learning_rate": 5.189614799380962e-06, "loss": 1.3874, "step": 9780 }, { "epoch": 0.4895, "grad_norm": 2.234375, "learning_rate": 5.181750507644644e-06, "loss": 1.3811, "step": 9790 }, { "epoch": 0.49, "grad_norm": 1.921875, "learning_rate": 5.173885765657815e-06, "loss": 1.3927, "step": 9800 }, { "epoch": 0.4905, "grad_norm": 4.03125, "learning_rate": 5.166020592903798e-06, "loss": 1.3837, "step": 9810 }, { "epoch": 0.491, "grad_norm": 2.03125, "learning_rate": 5.158155008866988e-06, "loss": 1.3908, "step": 9820 }, { "epoch": 0.4915, "grad_norm": 2.109375, "learning_rate": 5.15028903303279e-06, "loss": 1.3841, "step": 9830 }, { "epoch": 0.492, "grad_norm": 1.75, "learning_rate": 5.142422684887592e-06, "loss": 1.3784, "step": 9840 }, { "epoch": 0.4925, "grad_norm": 1.9296875, "learning_rate": 5.134555983918692e-06, "loss": 1.3841, "step": 9850 }, { "epoch": 0.493, "grad_norm": 1.9921875, "learning_rate": 5.126688949614269e-06, "loss": 1.4022, "step": 9860 }, { "epoch": 0.4935, "grad_norm": 2.390625, "learning_rate": 5.118821601463326e-06, "loss": 1.3808, "step": 9870 }, { "epoch": 0.494, "grad_norm": 2.046875, "learning_rate": 5.110953958955645e-06, "loss": 1.3891, "step": 9880 }, { "epoch": 0.4945, "grad_norm": 1.8359375, "learning_rate": 5.103086041581734e-06, "loss": 1.3827, "step": 9890 }, { "epoch": 0.495, "grad_norm": 2.859375, "learning_rate": 5.095217868832785e-06, "loss": 1.4015, "step": 9900 }, { "epoch": 0.4955, "grad_norm": 2.15625, "learning_rate": 5.087349460200619e-06, "loss": 1.3719, "step": 9910 }, { "epoch": 0.496, "grad_norm": 1.859375, "learning_rate": 5.079480835177647e-06, "loss": 1.3784, "step": 9920 }, { "epoch": 0.4965, "grad_norm": 1.984375, "learning_rate": 5.071612013256811e-06, "loss": 1.3778, "step": 9930 }, { "epoch": 0.497, "grad_norm": 2.484375, "learning_rate": 5.063743013931544e-06, "loss": 1.3826, "step": 9940 }, { "epoch": 0.4975, "grad_norm": 2.0, "learning_rate": 5.055873856695716e-06, "loss": 1.3891, "step": 9950 }, { "epoch": 0.498, "grad_norm": 1.953125, "learning_rate": 5.048004561043588e-06, "loss": 1.3739, "step": 9960 }, { "epoch": 0.4985, "grad_norm": 1.96875, "learning_rate": 5.0401351464697675e-06, "loss": 1.4025, "step": 9970 }, { "epoch": 0.499, "grad_norm": 2.1875, "learning_rate": 5.032265632469153e-06, "loss": 1.3884, "step": 9980 }, { "epoch": 0.4995, "grad_norm": 1.9921875, "learning_rate": 5.024396038536892e-06, "loss": 1.39, "step": 9990 }, { "epoch": 0.5, "grad_norm": 2.09375, "learning_rate": 5.016526384168328e-06, "loss": 1.3899, "step": 10000 }, { "epoch": 0.5005, "grad_norm": 1.9765625, "learning_rate": 5.008656688858953e-06, "loss": 1.3911, "step": 10010 }, { "epoch": 0.501, "grad_norm": 1.890625, "learning_rate": 5.000786972104365e-06, "loss": 1.3903, "step": 10020 }, { "epoch": 0.5015, "grad_norm": 2.0, "learning_rate": 4.99291725340021e-06, "loss": 1.396, "step": 10030 }, { "epoch": 0.502, "grad_norm": 1.9765625, "learning_rate": 4.9850475522421416e-06, "loss": 1.4007, "step": 10040 }, { "epoch": 0.5025, "grad_norm": 2.03125, "learning_rate": 4.977177888125771e-06, "loss": 1.3739, "step": 10050 }, { "epoch": 0.503, "grad_norm": 1.921875, "learning_rate": 4.969308280546616e-06, "loss": 1.3675, "step": 10060 }, { "epoch": 0.5035, "grad_norm": 2.0625, "learning_rate": 4.9614387490000545e-06, "loss": 1.3937, "step": 10070 }, { "epoch": 0.504, "grad_norm": 2.109375, "learning_rate": 4.953569312981275e-06, "loss": 1.392, "step": 10080 }, { "epoch": 0.5045, "grad_norm": 1.84375, "learning_rate": 4.94569999198523e-06, "loss": 1.3692, "step": 10090 }, { "epoch": 0.505, "grad_norm": 1.9765625, "learning_rate": 4.93783080550659e-06, "loss": 1.3869, "step": 10100 }, { "epoch": 0.5055, "grad_norm": 2.0625, "learning_rate": 4.929961773039687e-06, "loss": 1.3802, "step": 10110 }, { "epoch": 0.506, "grad_norm": 1.84375, "learning_rate": 4.922092914078475e-06, "loss": 1.3719, "step": 10120 }, { "epoch": 0.5065, "grad_norm": 2.015625, "learning_rate": 4.914224248116478e-06, "loss": 1.3898, "step": 10130 }, { "epoch": 0.507, "grad_norm": 1.9609375, "learning_rate": 4.906355794646742e-06, "loss": 1.3894, "step": 10140 }, { "epoch": 0.5075, "grad_norm": 1.75, "learning_rate": 4.8984875731617845e-06, "loss": 1.3965, "step": 10150 }, { "epoch": 0.508, "grad_norm": 1.9921875, "learning_rate": 4.890619603153549e-06, "loss": 1.3894, "step": 10160 }, { "epoch": 0.5085, "grad_norm": 1.9140625, "learning_rate": 4.882751904113357e-06, "loss": 1.4005, "step": 10170 }, { "epoch": 0.509, "grad_norm": 2.0625, "learning_rate": 4.874884495531862e-06, "loss": 1.3641, "step": 10180 }, { "epoch": 0.5095, "grad_norm": 2.046875, "learning_rate": 4.867017396898988e-06, "loss": 1.3688, "step": 10190 }, { "epoch": 0.51, "grad_norm": 1.9296875, "learning_rate": 4.859150627703903e-06, "loss": 1.3748, "step": 10200 }, { "epoch": 0.5105, "grad_norm": 1.8515625, "learning_rate": 4.85128420743495e-06, "loss": 1.3849, "step": 10210 }, { "epoch": 0.511, "grad_norm": 1.8125, "learning_rate": 4.843418155579614e-06, "loss": 1.3767, "step": 10220 }, { "epoch": 0.5115, "grad_norm": 1.6796875, "learning_rate": 4.83555249162446e-06, "loss": 1.3769, "step": 10230 }, { "epoch": 0.512, "grad_norm": 1.796875, "learning_rate": 4.827687235055101e-06, "loss": 1.3804, "step": 10240 }, { "epoch": 0.5125, "grad_norm": 1.9765625, "learning_rate": 4.819822405356133e-06, "loss": 1.3956, "step": 10250 }, { "epoch": 0.513, "grad_norm": 1.9765625, "learning_rate": 4.8119580220111e-06, "loss": 1.3758, "step": 10260 }, { "epoch": 0.5135, "grad_norm": 1.9609375, "learning_rate": 4.8040941045024355e-06, "loss": 1.3944, "step": 10270 }, { "epoch": 0.514, "grad_norm": 1.9921875, "learning_rate": 4.796230672311424e-06, "loss": 1.3759, "step": 10280 }, { "epoch": 0.5145, "grad_norm": 2.234375, "learning_rate": 4.788367744918142e-06, "loss": 1.3678, "step": 10290 }, { "epoch": 0.515, "grad_norm": 1.8671875, "learning_rate": 4.780505341801423e-06, "loss": 1.3894, "step": 10300 }, { "epoch": 0.5155, "grad_norm": 2.265625, "learning_rate": 4.772643482438792e-06, "loss": 1.3969, "step": 10310 }, { "epoch": 0.516, "grad_norm": 2.09375, "learning_rate": 4.764782186306435e-06, "loss": 1.3801, "step": 10320 }, { "epoch": 0.5165, "grad_norm": 1.7421875, "learning_rate": 4.756921472879141e-06, "loss": 1.385, "step": 10330 }, { "epoch": 0.517, "grad_norm": 1.8359375, "learning_rate": 4.749061361630254e-06, "loss": 1.3989, "step": 10340 }, { "epoch": 0.5175, "grad_norm": 1.9765625, "learning_rate": 4.741201872031625e-06, "loss": 1.3859, "step": 10350 }, { "epoch": 0.518, "grad_norm": 2.03125, "learning_rate": 4.733343023553568e-06, "loss": 1.3941, "step": 10360 }, { "epoch": 0.5185, "grad_norm": 1.9609375, "learning_rate": 4.7254848356648066e-06, "loss": 1.388, "step": 10370 }, { "epoch": 0.519, "grad_norm": 2.015625, "learning_rate": 4.717627327832431e-06, "loss": 1.3598, "step": 10380 }, { "epoch": 0.5195, "grad_norm": 1.9921875, "learning_rate": 4.7097705195218414e-06, "loss": 1.3847, "step": 10390 }, { "epoch": 0.52, "grad_norm": 2.046875, "learning_rate": 4.701914430196709e-06, "loss": 1.3712, "step": 10400 }, { "epoch": 0.5205, "grad_norm": 2.046875, "learning_rate": 4.6940590793189214e-06, "loss": 1.3905, "step": 10410 }, { "epoch": 0.521, "grad_norm": 2.109375, "learning_rate": 4.686204486348544e-06, "loss": 1.3753, "step": 10420 }, { "epoch": 0.5215, "grad_norm": 1.828125, "learning_rate": 4.678350670743754e-06, "loss": 1.384, "step": 10430 }, { "epoch": 0.522, "grad_norm": 2.0625, "learning_rate": 4.670497651960808e-06, "loss": 1.3929, "step": 10440 }, { "epoch": 0.5225, "grad_norm": 2.09375, "learning_rate": 4.662645449453989e-06, "loss": 1.3709, "step": 10450 }, { "epoch": 0.523, "grad_norm": 2.015625, "learning_rate": 4.6547940826755615e-06, "loss": 1.3759, "step": 10460 }, { "epoch": 0.5235, "grad_norm": 1.78125, "learning_rate": 4.64694357107571e-06, "loss": 1.3872, "step": 10470 }, { "epoch": 0.524, "grad_norm": 1.9609375, "learning_rate": 4.6390939341025075e-06, "loss": 1.3937, "step": 10480 }, { "epoch": 0.5245, "grad_norm": 1.9296875, "learning_rate": 4.631245191201859e-06, "loss": 1.3828, "step": 10490 }, { "epoch": 0.525, "grad_norm": 1.90625, "learning_rate": 4.623397361817456e-06, "loss": 1.3985, "step": 10500 }, { "epoch": 0.5255, "grad_norm": 1.8984375, "learning_rate": 4.615550465390723e-06, "loss": 1.4078, "step": 10510 }, { "epoch": 0.526, "grad_norm": 2.046875, "learning_rate": 4.6077045213607765e-06, "loss": 1.4079, "step": 10520 }, { "epoch": 0.5265, "grad_norm": 1.7734375, "learning_rate": 4.5998595491643705e-06, "loss": 1.3765, "step": 10530 }, { "epoch": 0.527, "grad_norm": 2.015625, "learning_rate": 4.5920155682358605e-06, "loss": 1.3994, "step": 10540 }, { "epoch": 0.5275, "grad_norm": 2.046875, "learning_rate": 4.584172598007132e-06, "loss": 1.3913, "step": 10550 }, { "epoch": 0.528, "grad_norm": 1.9140625, "learning_rate": 4.576330657907576e-06, "loss": 1.3705, "step": 10560 }, { "epoch": 0.5285, "grad_norm": 1.8203125, "learning_rate": 4.568489767364031e-06, "loss": 1.3757, "step": 10570 }, { "epoch": 0.529, "grad_norm": 1.84375, "learning_rate": 4.560649945800736e-06, "loss": 1.3648, "step": 10580 }, { "epoch": 0.5295, "grad_norm": 1.8203125, "learning_rate": 4.552811212639277e-06, "loss": 1.3706, "step": 10590 }, { "epoch": 0.53, "grad_norm": 2.1875, "learning_rate": 4.544973587298548e-06, "loss": 1.3843, "step": 10600 }, { "epoch": 0.5305, "grad_norm": 1.8046875, "learning_rate": 4.537137089194697e-06, "loss": 1.3787, "step": 10610 }, { "epoch": 0.531, "grad_norm": 1.9375, "learning_rate": 4.529301737741082e-06, "loss": 1.3826, "step": 10620 }, { "epoch": 0.5315, "grad_norm": 2.296875, "learning_rate": 4.521467552348213e-06, "loss": 1.3835, "step": 10630 }, { "epoch": 0.532, "grad_norm": 1.984375, "learning_rate": 4.513634552423721e-06, "loss": 1.3521, "step": 10640 }, { "epoch": 0.5325, "grad_norm": 1.8671875, "learning_rate": 4.505802757372295e-06, "loss": 1.3827, "step": 10650 }, { "epoch": 0.533, "grad_norm": 1.9453125, "learning_rate": 4.497972186595639e-06, "loss": 1.3946, "step": 10660 }, { "epoch": 0.5335, "grad_norm": 1.9375, "learning_rate": 4.490142859492425e-06, "loss": 1.3958, "step": 10670 }, { "epoch": 0.534, "grad_norm": 2.140625, "learning_rate": 4.482314795458245e-06, "loss": 1.3825, "step": 10680 }, { "epoch": 0.5345, "grad_norm": 2.15625, "learning_rate": 4.4744880138855625e-06, "loss": 1.3857, "step": 10690 }, { "epoch": 0.535, "grad_norm": 1.8671875, "learning_rate": 4.466662534163659e-06, "loss": 1.3764, "step": 10700 }, { "epoch": 0.5355, "grad_norm": 2.0, "learning_rate": 4.458838375678595e-06, "loss": 1.3893, "step": 10710 }, { "epoch": 0.536, "grad_norm": 2.0625, "learning_rate": 4.45101555781316e-06, "loss": 1.3855, "step": 10720 }, { "epoch": 0.5365, "grad_norm": 2.109375, "learning_rate": 4.443194099946819e-06, "loss": 1.382, "step": 10730 }, { "epoch": 0.537, "grad_norm": 1.7265625, "learning_rate": 4.435374021455668e-06, "loss": 1.3853, "step": 10740 }, { "epoch": 0.5375, "grad_norm": 1.7734375, "learning_rate": 4.4275553417123855e-06, "loss": 1.3801, "step": 10750 }, { "epoch": 0.538, "grad_norm": 1.8203125, "learning_rate": 4.419738080086188e-06, "loss": 1.3812, "step": 10760 }, { "epoch": 0.5385, "grad_norm": 1.9765625, "learning_rate": 4.411922255942776e-06, "loss": 1.3734, "step": 10770 }, { "epoch": 0.539, "grad_norm": 2.015625, "learning_rate": 4.4041078886442885e-06, "loss": 1.3612, "step": 10780 }, { "epoch": 0.5395, "grad_norm": 1.9375, "learning_rate": 4.396294997549259e-06, "loss": 1.3808, "step": 10790 }, { "epoch": 0.54, "grad_norm": 2.078125, "learning_rate": 4.388483602012557e-06, "loss": 1.3865, "step": 10800 }, { "epoch": 0.5405, "grad_norm": 1.8515625, "learning_rate": 4.38067372138536e-06, "loss": 1.3806, "step": 10810 }, { "epoch": 0.541, "grad_norm": 2.40625, "learning_rate": 4.372865375015077e-06, "loss": 1.3834, "step": 10820 }, { "epoch": 0.5415, "grad_norm": 1.953125, "learning_rate": 4.365058582245324e-06, "loss": 1.3614, "step": 10830 }, { "epoch": 0.542, "grad_norm": 1.96875, "learning_rate": 4.357253362415872e-06, "loss": 1.3852, "step": 10840 }, { "epoch": 0.5425, "grad_norm": 1.984375, "learning_rate": 4.349449734862591e-06, "loss": 1.384, "step": 10850 }, { "epoch": 0.543, "grad_norm": 2.03125, "learning_rate": 4.341647718917401e-06, "loss": 1.3834, "step": 10860 }, { "epoch": 0.5435, "grad_norm": 1.921875, "learning_rate": 4.33384733390824e-06, "loss": 1.3775, "step": 10870 }, { "epoch": 0.544, "grad_norm": 2.390625, "learning_rate": 4.3260485991589995e-06, "loss": 1.3689, "step": 10880 }, { "epoch": 0.5445, "grad_norm": 1.890625, "learning_rate": 4.3182515339894845e-06, "loss": 1.3801, "step": 10890 }, { "epoch": 0.545, "grad_norm": 1.9765625, "learning_rate": 4.310456157715364e-06, "loss": 1.3889, "step": 10900 }, { "epoch": 0.5455, "grad_norm": 1.9765625, "learning_rate": 4.302662489648121e-06, "loss": 1.3853, "step": 10910 }, { "epoch": 0.546, "grad_norm": 2.421875, "learning_rate": 4.294870549095009e-06, "loss": 1.3794, "step": 10920 }, { "epoch": 0.5465, "grad_norm": 1.734375, "learning_rate": 4.287080355359004e-06, "loss": 1.3731, "step": 10930 }, { "epoch": 0.547, "grad_norm": 2.03125, "learning_rate": 4.279291927738749e-06, "loss": 1.3835, "step": 10940 }, { "epoch": 0.5475, "grad_norm": 1.71875, "learning_rate": 4.271505285528515e-06, "loss": 1.3798, "step": 10950 }, { "epoch": 0.548, "grad_norm": 2.046875, "learning_rate": 4.26372044801815e-06, "loss": 1.3776, "step": 10960 }, { "epoch": 0.5485, "grad_norm": 2.15625, "learning_rate": 4.255937434493035e-06, "loss": 1.3964, "step": 10970 }, { "epoch": 0.549, "grad_norm": 2.171875, "learning_rate": 4.2481562642340225e-06, "loss": 1.3882, "step": 10980 }, { "epoch": 0.5495, "grad_norm": 1.828125, "learning_rate": 4.2403769565174075e-06, "loss": 1.3766, "step": 10990 }, { "epoch": 0.55, "grad_norm": 1.984375, "learning_rate": 4.2325995306148685e-06, "loss": 1.3708, "step": 11000 }, { "epoch": 0.5505, "grad_norm": 1.859375, "learning_rate": 4.224824005793421e-06, "loss": 1.3856, "step": 11010 }, { "epoch": 0.551, "grad_norm": 1.9453125, "learning_rate": 4.217050401315372e-06, "loss": 1.3847, "step": 11020 }, { "epoch": 0.5515, "grad_norm": 1.9296875, "learning_rate": 4.2092787364382705e-06, "loss": 1.3818, "step": 11030 }, { "epoch": 0.552, "grad_norm": 1.8671875, "learning_rate": 4.201509030414862e-06, "loss": 1.3784, "step": 11040 }, { "epoch": 0.5525, "grad_norm": 2.109375, "learning_rate": 4.193741302493037e-06, "loss": 1.3853, "step": 11050 }, { "epoch": 0.553, "grad_norm": 1.8515625, "learning_rate": 4.185975571915788e-06, "loss": 1.392, "step": 11060 }, { "epoch": 0.5535, "grad_norm": 2.109375, "learning_rate": 4.178211857921156e-06, "loss": 1.3684, "step": 11070 }, { "epoch": 0.554, "grad_norm": 1.8359375, "learning_rate": 4.170450179742192e-06, "loss": 1.3916, "step": 11080 }, { "epoch": 0.5545, "grad_norm": 1.9609375, "learning_rate": 4.162690556606902e-06, "loss": 1.388, "step": 11090 }, { "epoch": 0.555, "grad_norm": 2.015625, "learning_rate": 4.154933007738193e-06, "loss": 1.3853, "step": 11100 }, { "epoch": 0.5555, "grad_norm": 1.9296875, "learning_rate": 4.147177552353845e-06, "loss": 1.3943, "step": 11110 }, { "epoch": 0.556, "grad_norm": 2.046875, "learning_rate": 4.139424209666445e-06, "loss": 1.3694, "step": 11120 }, { "epoch": 0.5565, "grad_norm": 1.9296875, "learning_rate": 4.131672998883349e-06, "loss": 1.3882, "step": 11130 }, { "epoch": 0.557, "grad_norm": 2.0, "learning_rate": 4.123923939206629e-06, "loss": 1.4044, "step": 11140 }, { "epoch": 0.5575, "grad_norm": 1.953125, "learning_rate": 4.11617704983303e-06, "loss": 1.3923, "step": 11150 }, { "epoch": 0.558, "grad_norm": 1.8984375, "learning_rate": 4.10843234995392e-06, "loss": 1.3687, "step": 11160 }, { "epoch": 0.5585, "grad_norm": 1.9140625, "learning_rate": 4.100689858755246e-06, "loss": 1.3904, "step": 11170 }, { "epoch": 0.559, "grad_norm": 1.8671875, "learning_rate": 4.092949595417476e-06, "loss": 1.3807, "step": 11180 }, { "epoch": 0.5595, "grad_norm": 2.046875, "learning_rate": 4.085211579115566e-06, "loss": 1.3998, "step": 11190 }, { "epoch": 0.56, "grad_norm": 2.03125, "learning_rate": 4.077475829018903e-06, "loss": 1.3854, "step": 11200 }, { "epoch": 0.5605, "grad_norm": 1.9296875, "learning_rate": 4.069742364291258e-06, "loss": 1.4014, "step": 11210 }, { "epoch": 0.561, "grad_norm": 1.9140625, "learning_rate": 4.062011204090743e-06, "loss": 1.3875, "step": 11220 }, { "epoch": 0.5615, "grad_norm": 1.953125, "learning_rate": 4.054282367569762e-06, "loss": 1.38, "step": 11230 }, { "epoch": 0.562, "grad_norm": 1.9140625, "learning_rate": 4.046555873874961e-06, "loss": 1.4006, "step": 11240 }, { "epoch": 0.5625, "grad_norm": 2.09375, "learning_rate": 4.038831742147178e-06, "loss": 1.3921, "step": 11250 }, { "epoch": 0.563, "grad_norm": 1.9375, "learning_rate": 4.031109991521406e-06, "loss": 1.3846, "step": 11260 }, { "epoch": 0.5635, "grad_norm": 1.84375, "learning_rate": 4.023390641126738e-06, "loss": 1.3977, "step": 11270 }, { "epoch": 0.564, "grad_norm": 1.875, "learning_rate": 4.015673710086319e-06, "loss": 1.385, "step": 11280 }, { "epoch": 0.5645, "grad_norm": 1.75, "learning_rate": 4.0079592175173e-06, "loss": 1.3668, "step": 11290 }, { "epoch": 0.565, "grad_norm": 1.7421875, "learning_rate": 4.0002471825307946e-06, "loss": 1.4136, "step": 11300 }, { "epoch": 0.5655, "grad_norm": 1.9140625, "learning_rate": 3.992537624231824e-06, "loss": 1.3727, "step": 11310 }, { "epoch": 0.566, "grad_norm": 1.8671875, "learning_rate": 3.984830561719278e-06, "loss": 1.3854, "step": 11320 }, { "epoch": 0.5665, "grad_norm": 1.8671875, "learning_rate": 3.9771260140858605e-06, "loss": 1.3783, "step": 11330 }, { "epoch": 0.567, "grad_norm": 1.9453125, "learning_rate": 3.9694240004180475e-06, "loss": 1.3946, "step": 11340 }, { "epoch": 0.5675, "grad_norm": 1.9140625, "learning_rate": 3.961724539796034e-06, "loss": 1.3898, "step": 11350 }, { "epoch": 0.568, "grad_norm": 2.28125, "learning_rate": 3.954027651293698e-06, "loss": 1.378, "step": 11360 }, { "epoch": 0.5685, "grad_norm": 1.921875, "learning_rate": 3.946333353978534e-06, "loss": 1.3971, "step": 11370 }, { "epoch": 0.569, "grad_norm": 2.828125, "learning_rate": 3.9386416669116286e-06, "loss": 1.3969, "step": 11380 }, { "epoch": 0.5695, "grad_norm": 1.90625, "learning_rate": 3.930952609147594e-06, "loss": 1.3825, "step": 11390 }, { "epoch": 0.57, "grad_norm": 1.78125, "learning_rate": 3.923266199734537e-06, "loss": 1.375, "step": 11400 }, { "epoch": 0.5705, "grad_norm": 1.9140625, "learning_rate": 3.915582457713993e-06, "loss": 1.3807, "step": 11410 }, { "epoch": 0.571, "grad_norm": 1.8671875, "learning_rate": 3.907901402120899e-06, "loss": 1.3849, "step": 11420 }, { "epoch": 0.5715, "grad_norm": 1.890625, "learning_rate": 3.900223051983531e-06, "loss": 1.3804, "step": 11430 }, { "epoch": 0.572, "grad_norm": 1.984375, "learning_rate": 3.892547426323467e-06, "loss": 1.3877, "step": 11440 }, { "epoch": 0.5725, "grad_norm": 2.15625, "learning_rate": 3.884874544155531e-06, "loss": 1.3824, "step": 11450 }, { "epoch": 0.573, "grad_norm": 2.046875, "learning_rate": 3.877204424487754e-06, "loss": 1.3688, "step": 11460 }, { "epoch": 0.5735, "grad_norm": 1.765625, "learning_rate": 3.869537086321322e-06, "loss": 1.3702, "step": 11470 }, { "epoch": 0.574, "grad_norm": 2.109375, "learning_rate": 3.861872548650535e-06, "loss": 1.3934, "step": 11480 }, { "epoch": 0.5745, "grad_norm": 2.09375, "learning_rate": 3.854210830462745e-06, "loss": 1.3824, "step": 11490 }, { "epoch": 0.575, "grad_norm": 2.8125, "learning_rate": 3.84655195073833e-06, "loss": 1.3653, "step": 11500 }, { "epoch": 0.5755, "grad_norm": 1.875, "learning_rate": 3.83889592845063e-06, "loss": 1.3787, "step": 11510 }, { "epoch": 0.576, "grad_norm": 1.8125, "learning_rate": 3.83124278256591e-06, "loss": 1.3764, "step": 11520 }, { "epoch": 0.5765, "grad_norm": 1.859375, "learning_rate": 3.823592532043303e-06, "loss": 1.381, "step": 11530 }, { "epoch": 0.577, "grad_norm": 2.015625, "learning_rate": 3.81594519583478e-06, "loss": 1.3854, "step": 11540 }, { "epoch": 0.5775, "grad_norm": 2.09375, "learning_rate": 3.8083007928850803e-06, "loss": 1.3971, "step": 11550 }, { "epoch": 0.578, "grad_norm": 1.84375, "learning_rate": 3.800659342131686e-06, "loss": 1.3901, "step": 11560 }, { "epoch": 0.5785, "grad_norm": 1.859375, "learning_rate": 3.7930208625047605e-06, "loss": 1.3846, "step": 11570 }, { "epoch": 0.579, "grad_norm": 1.796875, "learning_rate": 3.7853853729271086e-06, "loss": 1.3867, "step": 11580 }, { "epoch": 0.5795, "grad_norm": 2.046875, "learning_rate": 3.777752892314127e-06, "loss": 1.3751, "step": 11590 }, { "epoch": 0.58, "grad_norm": 2.046875, "learning_rate": 3.770123439573764e-06, "loss": 1.4076, "step": 11600 }, { "epoch": 0.5805, "grad_norm": 2.0, "learning_rate": 3.7624970336064554e-06, "loss": 1.3968, "step": 11610 }, { "epoch": 0.581, "grad_norm": 2.09375, "learning_rate": 3.754873693305097e-06, "loss": 1.3819, "step": 11620 }, { "epoch": 0.5815, "grad_norm": 2.359375, "learning_rate": 3.7472534375549917e-06, "loss": 1.39, "step": 11630 }, { "epoch": 0.582, "grad_norm": 2.015625, "learning_rate": 3.7396362852337984e-06, "loss": 1.3764, "step": 11640 }, { "epoch": 0.5825, "grad_norm": 1.8828125, "learning_rate": 3.732022255211485e-06, "loss": 1.3784, "step": 11650 }, { "epoch": 0.583, "grad_norm": 2.1875, "learning_rate": 3.724411366350287e-06, "loss": 1.3815, "step": 11660 }, { "epoch": 0.5835, "grad_norm": 1.78125, "learning_rate": 3.7168036375046602e-06, "loss": 1.3733, "step": 11670 }, { "epoch": 0.584, "grad_norm": 1.9453125, "learning_rate": 3.7091990875212312e-06, "loss": 1.3819, "step": 11680 }, { "epoch": 0.5845, "grad_norm": 1.84375, "learning_rate": 3.701597735238748e-06, "loss": 1.3916, "step": 11690 }, { "epoch": 0.585, "grad_norm": 3.640625, "learning_rate": 3.6939995994880405e-06, "loss": 1.3772, "step": 11700 }, { "epoch": 0.5855, "grad_norm": 1.90625, "learning_rate": 3.6864046990919698e-06, "loss": 1.3904, "step": 11710 }, { "epoch": 0.586, "grad_norm": 1.875, "learning_rate": 3.6788130528653854e-06, "loss": 1.3773, "step": 11720 }, { "epoch": 0.5865, "grad_norm": 2.015625, "learning_rate": 3.6712246796150653e-06, "loss": 1.3809, "step": 11730 }, { "epoch": 0.587, "grad_norm": 2.125, "learning_rate": 3.663639598139688e-06, "loss": 1.3754, "step": 11740 }, { "epoch": 0.5875, "grad_norm": 1.953125, "learning_rate": 3.6560578272297785e-06, "loss": 1.3834, "step": 11750 }, { "epoch": 0.588, "grad_norm": 2.015625, "learning_rate": 3.648479385667651e-06, "loss": 1.3856, "step": 11760 }, { "epoch": 0.5885, "grad_norm": 1.8671875, "learning_rate": 3.640904292227382e-06, "loss": 1.3848, "step": 11770 }, { "epoch": 0.589, "grad_norm": 2.078125, "learning_rate": 3.6333325656747476e-06, "loss": 1.3909, "step": 11780 }, { "epoch": 0.5895, "grad_norm": 1.9765625, "learning_rate": 3.6257642247671877e-06, "loss": 1.3837, "step": 11790 }, { "epoch": 0.59, "grad_norm": 1.859375, "learning_rate": 3.6181992882537498e-06, "loss": 1.3904, "step": 11800 }, { "epoch": 0.5905, "grad_norm": 1.8046875, "learning_rate": 3.61063777487505e-06, "loss": 1.3775, "step": 11810 }, { "epoch": 0.591, "grad_norm": 1.8515625, "learning_rate": 3.603079703363226e-06, "loss": 1.379, "step": 11820 }, { "epoch": 0.5915, "grad_norm": 1.84375, "learning_rate": 3.5955250924418883e-06, "loss": 1.3647, "step": 11830 }, { "epoch": 0.592, "grad_norm": 2.03125, "learning_rate": 3.587973960826071e-06, "loss": 1.3804, "step": 11840 }, { "epoch": 0.5925, "grad_norm": 2.109375, "learning_rate": 3.5804263272221928e-06, "loss": 1.3877, "step": 11850 }, { "epoch": 0.593, "grad_norm": 1.84375, "learning_rate": 3.572882210328005e-06, "loss": 1.3742, "step": 11860 }, { "epoch": 0.5935, "grad_norm": 1.9140625, "learning_rate": 3.5653416288325506e-06, "loss": 1.3793, "step": 11870 }, { "epoch": 0.594, "grad_norm": 1.828125, "learning_rate": 3.557804601416106e-06, "loss": 1.3826, "step": 11880 }, { "epoch": 0.5945, "grad_norm": 1.9375, "learning_rate": 3.5502711467501504e-06, "loss": 1.3807, "step": 11890 }, { "epoch": 0.595, "grad_norm": 2.03125, "learning_rate": 3.5427412834973124e-06, "loss": 1.3923, "step": 11900 }, { "epoch": 0.5955, "grad_norm": 1.875, "learning_rate": 3.5352150303113203e-06, "loss": 1.3903, "step": 11910 }, { "epoch": 0.596, "grad_norm": 1.8359375, "learning_rate": 3.5276924058369588e-06, "loss": 1.3738, "step": 11920 }, { "epoch": 0.5965, "grad_norm": 1.90625, "learning_rate": 3.5201734287100253e-06, "loss": 1.3876, "step": 11930 }, { "epoch": 0.597, "grad_norm": 1.671875, "learning_rate": 3.512658117557281e-06, "loss": 1.3907, "step": 11940 }, { "epoch": 0.5975, "grad_norm": 1.9765625, "learning_rate": 3.5051464909964068e-06, "loss": 1.3887, "step": 11950 }, { "epoch": 0.598, "grad_norm": 2.0, "learning_rate": 3.4976385676359515e-06, "loss": 1.3886, "step": 11960 }, { "epoch": 0.5985, "grad_norm": 1.921875, "learning_rate": 3.490134366075294e-06, "loss": 1.3935, "step": 11970 }, { "epoch": 0.599, "grad_norm": 2.015625, "learning_rate": 3.4826339049045915e-06, "loss": 1.3933, "step": 11980 }, { "epoch": 0.5995, "grad_norm": 1.984375, "learning_rate": 3.475137202704739e-06, "loss": 1.3705, "step": 11990 }, { "epoch": 0.6, "grad_norm": 1.96875, "learning_rate": 3.4676442780473114e-06, "loss": 1.3784, "step": 12000 }, { "epoch": 0.6005, "grad_norm": 2.0, "learning_rate": 3.4601551494945306e-06, "loss": 1.3706, "step": 12010 }, { "epoch": 0.601, "grad_norm": 1.765625, "learning_rate": 3.452669835599216e-06, "loss": 1.3796, "step": 12020 }, { "epoch": 0.6015, "grad_norm": 1.921875, "learning_rate": 3.445188354904736e-06, "loss": 1.3908, "step": 12030 }, { "epoch": 0.602, "grad_norm": 1.8203125, "learning_rate": 3.437710725944956e-06, "loss": 1.388, "step": 12040 }, { "epoch": 0.6025, "grad_norm": 2.03125, "learning_rate": 3.4302369672442108e-06, "loss": 1.381, "step": 12050 }, { "epoch": 0.603, "grad_norm": 1.9375, "learning_rate": 3.42276709731724e-06, "loss": 1.3867, "step": 12060 }, { "epoch": 0.6035, "grad_norm": 2.15625, "learning_rate": 3.415301134669153e-06, "loss": 1.3926, "step": 12070 }, { "epoch": 0.604, "grad_norm": 2.34375, "learning_rate": 3.407839097795376e-06, "loss": 1.3894, "step": 12080 }, { "epoch": 0.6045, "grad_norm": 1.9921875, "learning_rate": 3.400381005181612e-06, "loss": 1.3602, "step": 12090 }, { "epoch": 0.605, "grad_norm": 1.921875, "learning_rate": 3.3929268753037946e-06, "loss": 1.3817, "step": 12100 }, { "epoch": 0.6055, "grad_norm": 1.9609375, "learning_rate": 3.3854767266280386e-06, "loss": 1.38, "step": 12110 }, { "epoch": 0.606, "grad_norm": 1.859375, "learning_rate": 3.3780305776105955e-06, "loss": 1.37, "step": 12120 }, { "epoch": 0.6065, "grad_norm": 2.046875, "learning_rate": 3.3705884466978084e-06, "loss": 1.3897, "step": 12130 }, { "epoch": 0.607, "grad_norm": 2.046875, "learning_rate": 3.36315035232607e-06, "loss": 1.3855, "step": 12140 }, { "epoch": 0.6075, "grad_norm": 2.234375, "learning_rate": 3.355716312921771e-06, "loss": 1.3887, "step": 12150 }, { "epoch": 0.608, "grad_norm": 1.859375, "learning_rate": 3.3482863469012524e-06, "loss": 1.386, "step": 12160 }, { "epoch": 0.6085, "grad_norm": 2.03125, "learning_rate": 3.3408604726707707e-06, "loss": 1.3834, "step": 12170 }, { "epoch": 0.609, "grad_norm": 2.15625, "learning_rate": 3.333438708626443e-06, "loss": 1.3657, "step": 12180 }, { "epoch": 0.6095, "grad_norm": 1.8203125, "learning_rate": 3.3260210731542063e-06, "loss": 1.3688, "step": 12190 }, { "epoch": 0.61, "grad_norm": 2.0, "learning_rate": 3.318607584629765e-06, "loss": 1.3802, "step": 12200 }, { "epoch": 0.6105, "grad_norm": 1.8359375, "learning_rate": 3.311198261418554e-06, "loss": 1.3892, "step": 12210 }, { "epoch": 0.611, "grad_norm": 2.0625, "learning_rate": 3.3037931218756897e-06, "loss": 1.4006, "step": 12220 }, { "epoch": 0.6115, "grad_norm": 2.046875, "learning_rate": 3.296392184345924e-06, "loss": 1.3898, "step": 12230 }, { "epoch": 0.612, "grad_norm": 1.8515625, "learning_rate": 3.288995467163597e-06, "loss": 1.3777, "step": 12240 }, { "epoch": 0.6125, "grad_norm": 1.859375, "learning_rate": 3.2816029886525947e-06, "loss": 1.3874, "step": 12250 }, { "epoch": 0.613, "grad_norm": 1.9453125, "learning_rate": 3.274214767126306e-06, "loss": 1.3845, "step": 12260 }, { "epoch": 0.6135, "grad_norm": 2.1875, "learning_rate": 3.266830820887571e-06, "loss": 1.3922, "step": 12270 }, { "epoch": 0.614, "grad_norm": 1.8671875, "learning_rate": 3.2594511682286343e-06, "loss": 1.3737, "step": 12280 }, { "epoch": 0.6145, "grad_norm": 2.15625, "learning_rate": 3.2520758274311138e-06, "loss": 1.3773, "step": 12290 }, { "epoch": 0.615, "grad_norm": 1.921875, "learning_rate": 3.2447048167659403e-06, "loss": 1.3826, "step": 12300 }, { "epoch": 0.6155, "grad_norm": 1.890625, "learning_rate": 3.237338154493312e-06, "loss": 1.3827, "step": 12310 }, { "epoch": 0.616, "grad_norm": 2.125, "learning_rate": 3.2299758588626657e-06, "loss": 1.4008, "step": 12320 }, { "epoch": 0.6165, "grad_norm": 2.84375, "learning_rate": 3.222617948112614e-06, "loss": 1.3876, "step": 12330 }, { "epoch": 0.617, "grad_norm": 1.9609375, "learning_rate": 3.215264440470909e-06, "loss": 1.3867, "step": 12340 }, { "epoch": 0.6175, "grad_norm": 1.765625, "learning_rate": 3.2079153541543927e-06, "loss": 1.3735, "step": 12350 }, { "epoch": 0.618, "grad_norm": 1.8671875, "learning_rate": 3.200570707368957e-06, "loss": 1.3755, "step": 12360 }, { "epoch": 0.6185, "grad_norm": 1.9296875, "learning_rate": 3.1932305183094937e-06, "loss": 1.3745, "step": 12370 }, { "epoch": 0.619, "grad_norm": 1.9453125, "learning_rate": 3.185894805159855e-06, "loss": 1.4057, "step": 12380 }, { "epoch": 0.6195, "grad_norm": 1.859375, "learning_rate": 3.1785635860927977e-06, "loss": 1.3872, "step": 12390 }, { "epoch": 0.62, "grad_norm": 2.03125, "learning_rate": 3.171236879269952e-06, "loss": 1.3746, "step": 12400 }, { "epoch": 0.6205, "grad_norm": 1.8515625, "learning_rate": 3.1639147028417683e-06, "loss": 1.4029, "step": 12410 }, { "epoch": 0.621, "grad_norm": 2.015625, "learning_rate": 3.1565970749474756e-06, "loss": 1.385, "step": 12420 }, { "epoch": 0.6215, "grad_norm": 1.984375, "learning_rate": 3.149284013715027e-06, "loss": 1.3698, "step": 12430 }, { "epoch": 0.622, "grad_norm": 1.859375, "learning_rate": 3.1419755372610727e-06, "loss": 1.3923, "step": 12440 }, { "epoch": 0.6225, "grad_norm": 2.046875, "learning_rate": 3.134671663690899e-06, "loss": 1.3861, "step": 12450 }, { "epoch": 0.623, "grad_norm": 1.9453125, "learning_rate": 3.127372411098393e-06, "loss": 1.3707, "step": 12460 }, { "epoch": 0.6235, "grad_norm": 1.9609375, "learning_rate": 3.120077797565989e-06, "loss": 1.376, "step": 12470 }, { "epoch": 0.624, "grad_norm": 2.015625, "learning_rate": 3.1127878411646335e-06, "loss": 1.3823, "step": 12480 }, { "epoch": 0.6245, "grad_norm": 1.90625, "learning_rate": 3.105502559953735e-06, "loss": 1.3886, "step": 12490 }, { "epoch": 0.625, "grad_norm": 1.8359375, "learning_rate": 3.0982219719811212e-06, "loss": 1.3751, "step": 12500 }, { "epoch": 0.6255, "grad_norm": 2.015625, "learning_rate": 3.090946095282989e-06, "loss": 1.4024, "step": 12510 }, { "epoch": 0.626, "grad_norm": 2.1875, "learning_rate": 3.083674947883868e-06, "loss": 1.392, "step": 12520 }, { "epoch": 0.6265, "grad_norm": 1.828125, "learning_rate": 3.0764085477965706e-06, "loss": 1.3838, "step": 12530 }, { "epoch": 0.627, "grad_norm": 1.953125, "learning_rate": 3.069146913022152e-06, "loss": 1.3654, "step": 12540 }, { "epoch": 0.6275, "grad_norm": 1.9375, "learning_rate": 3.0618900615498533e-06, "loss": 1.3694, "step": 12550 }, { "epoch": 0.628, "grad_norm": 1.9296875, "learning_rate": 3.0546380113570755e-06, "loss": 1.3848, "step": 12560 }, { "epoch": 0.6285, "grad_norm": 1.890625, "learning_rate": 3.0473907804093217e-06, "loss": 1.3945, "step": 12570 }, { "epoch": 0.629, "grad_norm": 2.015625, "learning_rate": 3.0401483866601566e-06, "loss": 1.3858, "step": 12580 }, { "epoch": 0.6295, "grad_norm": 2.03125, "learning_rate": 3.0329108480511593e-06, "loss": 1.3855, "step": 12590 }, { "epoch": 0.63, "grad_norm": 2.078125, "learning_rate": 3.025678182511885e-06, "loss": 1.3922, "step": 12600 }, { "epoch": 0.6305, "grad_norm": 2.375, "learning_rate": 3.018450407959814e-06, "loss": 1.3724, "step": 12610 }, { "epoch": 0.631, "grad_norm": 2.25, "learning_rate": 3.0112275423003137e-06, "loss": 1.3947, "step": 12620 }, { "epoch": 0.6315, "grad_norm": 1.9765625, "learning_rate": 3.004009603426584e-06, "loss": 1.3879, "step": 12630 }, { "epoch": 0.632, "grad_norm": 1.9765625, "learning_rate": 2.996796609219628e-06, "loss": 1.3913, "step": 12640 }, { "epoch": 0.6325, "grad_norm": 1.9296875, "learning_rate": 2.989588577548193e-06, "loss": 1.3848, "step": 12650 }, { "epoch": 0.633, "grad_norm": 1.8671875, "learning_rate": 2.982385526268739e-06, "loss": 1.3809, "step": 12660 }, { "epoch": 0.6335, "grad_norm": 2.0625, "learning_rate": 2.975187473225378e-06, "loss": 1.417, "step": 12670 }, { "epoch": 0.634, "grad_norm": 1.8984375, "learning_rate": 2.96799443624985e-06, "loss": 1.3817, "step": 12680 }, { "epoch": 0.6345, "grad_norm": 1.96875, "learning_rate": 2.960806433161464e-06, "loss": 1.3931, "step": 12690 }, { "epoch": 0.635, "grad_norm": 1.9765625, "learning_rate": 2.9536234817670594e-06, "loss": 1.3948, "step": 12700 }, { "epoch": 0.6355, "grad_norm": 1.9296875, "learning_rate": 2.9464455998609596e-06, "loss": 1.3918, "step": 12710 }, { "epoch": 0.636, "grad_norm": 2.0625, "learning_rate": 2.939272805224932e-06, "loss": 1.3866, "step": 12720 }, { "epoch": 0.6365, "grad_norm": 1.984375, "learning_rate": 2.9321051156281388e-06, "loss": 1.3772, "step": 12730 }, { "epoch": 0.637, "grad_norm": 1.6875, "learning_rate": 2.9249425488270975e-06, "loss": 1.3801, "step": 12740 }, { "epoch": 0.6375, "grad_norm": 2.140625, "learning_rate": 2.917785122565632e-06, "loss": 1.3889, "step": 12750 }, { "epoch": 0.638, "grad_norm": 2.1875, "learning_rate": 2.9106328545748373e-06, "loss": 1.3928, "step": 12760 }, { "epoch": 0.6385, "grad_norm": 1.9140625, "learning_rate": 2.9034857625730216e-06, "loss": 1.3814, "step": 12770 }, { "epoch": 0.639, "grad_norm": 2.0625, "learning_rate": 2.8963438642656787e-06, "loss": 1.3792, "step": 12780 }, { "epoch": 0.6395, "grad_norm": 1.984375, "learning_rate": 2.8892071773454318e-06, "loss": 1.3731, "step": 12790 }, { "epoch": 0.64, "grad_norm": 2.109375, "learning_rate": 2.8820757194919915e-06, "loss": 1.3827, "step": 12800 }, { "epoch": 0.6405, "grad_norm": 1.9765625, "learning_rate": 2.8749495083721213e-06, "loss": 1.3716, "step": 12810 }, { "epoch": 0.641, "grad_norm": 1.9765625, "learning_rate": 2.867828561639582e-06, "loss": 1.399, "step": 12820 }, { "epoch": 0.6415, "grad_norm": 2.03125, "learning_rate": 2.8607128969350915e-06, "loss": 1.387, "step": 12830 }, { "epoch": 0.642, "grad_norm": 1.84375, "learning_rate": 2.8536025318862875e-06, "loss": 1.3651, "step": 12840 }, { "epoch": 0.6425, "grad_norm": 1.8203125, "learning_rate": 2.8464974841076803e-06, "loss": 1.3821, "step": 12850 }, { "epoch": 0.643, "grad_norm": 2.046875, "learning_rate": 2.839397771200596e-06, "loss": 1.3866, "step": 12860 }, { "epoch": 0.6435, "grad_norm": 2.09375, "learning_rate": 2.8323034107531566e-06, "loss": 1.3938, "step": 12870 }, { "epoch": 0.644, "grad_norm": 2.015625, "learning_rate": 2.825214420340222e-06, "loss": 1.387, "step": 12880 }, { "epoch": 0.6445, "grad_norm": 1.765625, "learning_rate": 2.8181308175233457e-06, "loss": 1.3883, "step": 12890 }, { "epoch": 0.645, "grad_norm": 2.09375, "learning_rate": 2.811052619850734e-06, "loss": 1.3947, "step": 12900 }, { "epoch": 0.6455, "grad_norm": 2.359375, "learning_rate": 2.8039798448572087e-06, "loss": 1.3896, "step": 12910 }, { "epoch": 0.646, "grad_norm": 1.8359375, "learning_rate": 2.7969125100641514e-06, "loss": 1.3868, "step": 12920 }, { "epoch": 0.6465, "grad_norm": 1.8046875, "learning_rate": 2.789850632979473e-06, "loss": 1.3736, "step": 12930 }, { "epoch": 0.647, "grad_norm": 2.125, "learning_rate": 2.7827942310975598e-06, "loss": 1.39, "step": 12940 }, { "epoch": 0.6475, "grad_norm": 1.9296875, "learning_rate": 2.7757433218992312e-06, "loss": 1.3737, "step": 12950 }, { "epoch": 0.648, "grad_norm": 1.890625, "learning_rate": 2.7686979228517096e-06, "loss": 1.3849, "step": 12960 }, { "epoch": 0.6485, "grad_norm": 1.9296875, "learning_rate": 2.761658051408559e-06, "loss": 1.3875, "step": 12970 }, { "epoch": 0.649, "grad_norm": 1.890625, "learning_rate": 2.7546237250096497e-06, "loss": 1.3725, "step": 12980 }, { "epoch": 0.6495, "grad_norm": 1.9609375, "learning_rate": 2.74759496108112e-06, "loss": 1.3951, "step": 12990 }, { "epoch": 0.65, "grad_norm": 2.109375, "learning_rate": 2.740571777035327e-06, "loss": 1.378, "step": 13000 }, { "epoch": 0.6505, "grad_norm": 1.9140625, "learning_rate": 2.733554190270803e-06, "loss": 1.3788, "step": 13010 }, { "epoch": 0.651, "grad_norm": 1.8671875, "learning_rate": 2.7265422181722117e-06, "loss": 1.3856, "step": 13020 }, { "epoch": 0.6515, "grad_norm": 2.078125, "learning_rate": 2.7195358781103153e-06, "loss": 1.389, "step": 13030 }, { "epoch": 0.652, "grad_norm": 1.890625, "learning_rate": 2.712535187441916e-06, "loss": 1.3864, "step": 13040 }, { "epoch": 0.6525, "grad_norm": 2.015625, "learning_rate": 2.7055401635098265e-06, "loss": 1.3806, "step": 13050 }, { "epoch": 0.653, "grad_norm": 1.8046875, "learning_rate": 2.698550823642817e-06, "loss": 1.4131, "step": 13060 }, { "epoch": 0.6535, "grad_norm": 1.8203125, "learning_rate": 2.691567185155578e-06, "loss": 1.3819, "step": 13070 }, { "epoch": 0.654, "grad_norm": 1.7734375, "learning_rate": 2.684589265348678e-06, "loss": 1.3836, "step": 13080 }, { "epoch": 0.6545, "grad_norm": 1.984375, "learning_rate": 2.6776170815085162e-06, "loss": 1.3692, "step": 13090 }, { "epoch": 0.655, "grad_norm": 2.03125, "learning_rate": 2.6706506509072803e-06, "loss": 1.3866, "step": 13100 }, { "epoch": 0.6555, "grad_norm": 2.046875, "learning_rate": 2.663689990802912e-06, "loss": 1.3756, "step": 13110 }, { "epoch": 0.656, "grad_norm": 1.8671875, "learning_rate": 2.656735118439051e-06, "loss": 1.3933, "step": 13120 }, { "epoch": 0.6565, "grad_norm": 1.984375, "learning_rate": 2.6497860510450045e-06, "loss": 1.376, "step": 13130 }, { "epoch": 0.657, "grad_norm": 1.890625, "learning_rate": 2.6428428058356925e-06, "loss": 1.3759, "step": 13140 }, { "epoch": 0.6575, "grad_norm": 1.9609375, "learning_rate": 2.635905400011619e-06, "loss": 1.3957, "step": 13150 }, { "epoch": 0.658, "grad_norm": 1.890625, "learning_rate": 2.6289738507588157e-06, "loss": 1.3827, "step": 13160 }, { "epoch": 0.6585, "grad_norm": 1.8984375, "learning_rate": 2.6220481752488125e-06, "loss": 1.3866, "step": 13170 }, { "epoch": 0.659, "grad_norm": 1.8984375, "learning_rate": 2.6151283906385816e-06, "loss": 1.3701, "step": 13180 }, { "epoch": 0.6595, "grad_norm": 2.0625, "learning_rate": 2.608214514070504e-06, "loss": 1.3772, "step": 13190 }, { "epoch": 0.66, "grad_norm": 1.890625, "learning_rate": 2.601306562672329e-06, "loss": 1.3931, "step": 13200 }, { "epoch": 0.6605, "grad_norm": 2.140625, "learning_rate": 2.5944045535571207e-06, "loss": 1.3849, "step": 13210 }, { "epoch": 0.661, "grad_norm": 1.9375, "learning_rate": 2.587508503823224e-06, "loss": 1.3754, "step": 13220 }, { "epoch": 0.6615, "grad_norm": 2.203125, "learning_rate": 2.5806184305542264e-06, "loss": 1.3719, "step": 13230 }, { "epoch": 0.662, "grad_norm": 1.8046875, "learning_rate": 2.5737343508189016e-06, "loss": 1.3907, "step": 13240 }, { "epoch": 0.6625, "grad_norm": 2.484375, "learning_rate": 2.566856281671183e-06, "loss": 1.388, "step": 13250 }, { "epoch": 0.663, "grad_norm": 1.828125, "learning_rate": 2.5599842401501074e-06, "loss": 1.38, "step": 13260 }, { "epoch": 0.6635, "grad_norm": 1.8984375, "learning_rate": 2.553118243279785e-06, "loss": 1.3764, "step": 13270 }, { "epoch": 0.664, "grad_norm": 2.0, "learning_rate": 2.5462583080693466e-06, "loss": 1.3934, "step": 13280 }, { "epoch": 0.6645, "grad_norm": 2.015625, "learning_rate": 2.539404451512912e-06, "loss": 1.3898, "step": 13290 }, { "epoch": 0.665, "grad_norm": 1.84375, "learning_rate": 2.532556690589537e-06, "loss": 1.372, "step": 13300 }, { "epoch": 0.6655, "grad_norm": 1.9375, "learning_rate": 2.525715042263178e-06, "loss": 1.3901, "step": 13310 }, { "epoch": 0.666, "grad_norm": 2.0625, "learning_rate": 2.5188795234826526e-06, "loss": 1.3823, "step": 13320 }, { "epoch": 0.6665, "grad_norm": 1.8515625, "learning_rate": 2.512050151181589e-06, "loss": 1.3921, "step": 13330 }, { "epoch": 0.667, "grad_norm": 1.7890625, "learning_rate": 2.5052269422783882e-06, "loss": 1.4044, "step": 13340 }, { "epoch": 0.6675, "grad_norm": 1.9375, "learning_rate": 2.4984099136761893e-06, "loss": 1.4011, "step": 13350 }, { "epoch": 0.668, "grad_norm": 1.8984375, "learning_rate": 2.4915990822628117e-06, "loss": 1.3945, "step": 13360 }, { "epoch": 0.6685, "grad_norm": 2.109375, "learning_rate": 2.484794464910732e-06, "loss": 1.4034, "step": 13370 }, { "epoch": 0.669, "grad_norm": 2.0625, "learning_rate": 2.477996078477026e-06, "loss": 1.3815, "step": 13380 }, { "epoch": 0.6695, "grad_norm": 1.96875, "learning_rate": 2.471203939803332e-06, "loss": 1.3962, "step": 13390 }, { "epoch": 0.67, "grad_norm": 1.96875, "learning_rate": 2.464418065715819e-06, "loss": 1.3844, "step": 13400 }, { "epoch": 0.6705, "grad_norm": 1.7734375, "learning_rate": 2.457638473025132e-06, "loss": 1.3862, "step": 13410 }, { "epoch": 0.671, "grad_norm": 2.15625, "learning_rate": 2.4508651785263533e-06, "loss": 1.3853, "step": 13420 }, { "epoch": 0.6715, "grad_norm": 2.140625, "learning_rate": 2.4440981989989636e-06, "loss": 1.3875, "step": 13430 }, { "epoch": 0.672, "grad_norm": 2.078125, "learning_rate": 2.437337551206804e-06, "loss": 1.3761, "step": 13440 }, { "epoch": 0.6725, "grad_norm": 1.875, "learning_rate": 2.4305832518980243e-06, "loss": 1.3631, "step": 13450 }, { "epoch": 0.673, "grad_norm": 1.71875, "learning_rate": 2.4238353178050473e-06, "loss": 1.3617, "step": 13460 }, { "epoch": 0.6735, "grad_norm": 2.015625, "learning_rate": 2.417093765644534e-06, "loss": 1.3698, "step": 13470 }, { "epoch": 0.674, "grad_norm": 1.921875, "learning_rate": 2.4103586121173277e-06, "loss": 1.3714, "step": 13480 }, { "epoch": 0.6745, "grad_norm": 2.046875, "learning_rate": 2.4036298739084225e-06, "loss": 1.392, "step": 13490 }, { "epoch": 0.675, "grad_norm": 2.03125, "learning_rate": 2.3969075676869237e-06, "loss": 1.3956, "step": 13500 }, { "epoch": 0.6755, "grad_norm": 1.7265625, "learning_rate": 2.3901917101059966e-06, "loss": 1.3785, "step": 13510 }, { "epoch": 0.676, "grad_norm": 1.8203125, "learning_rate": 2.3834823178028384e-06, "loss": 1.3798, "step": 13520 }, { "epoch": 0.6765, "grad_norm": 1.84375, "learning_rate": 2.376779407398621e-06, "loss": 1.397, "step": 13530 }, { "epoch": 0.677, "grad_norm": 2.03125, "learning_rate": 2.3700829954984685e-06, "loss": 1.3854, "step": 13540 }, { "epoch": 0.6775, "grad_norm": 2.015625, "learning_rate": 2.363393098691397e-06, "loss": 1.3957, "step": 13550 }, { "epoch": 0.678, "grad_norm": 1.8984375, "learning_rate": 2.3567097335502905e-06, "loss": 1.3884, "step": 13560 }, { "epoch": 0.6785, "grad_norm": 1.796875, "learning_rate": 2.350032916631847e-06, "loss": 1.3793, "step": 13570 }, { "epoch": 0.679, "grad_norm": 1.8359375, "learning_rate": 2.3433626644765424e-06, "loss": 1.3896, "step": 13580 }, { "epoch": 0.6795, "grad_norm": 1.84375, "learning_rate": 2.3366989936085958e-06, "loss": 1.3767, "step": 13590 }, { "epoch": 0.68, "grad_norm": 1.90625, "learning_rate": 2.330041920535916e-06, "loss": 1.3898, "step": 13600 }, { "epoch": 0.6805, "grad_norm": 1.9453125, "learning_rate": 2.3233914617500675e-06, "loss": 1.3705, "step": 13610 }, { "epoch": 0.681, "grad_norm": 1.9140625, "learning_rate": 2.3167476337262347e-06, "loss": 1.3756, "step": 13620 }, { "epoch": 0.6815, "grad_norm": 2.125, "learning_rate": 2.3101104529231676e-06, "loss": 1.388, "step": 13630 }, { "epoch": 0.682, "grad_norm": 1.9296875, "learning_rate": 2.303479935783159e-06, "loss": 1.3679, "step": 13640 }, { "epoch": 0.6825, "grad_norm": 2.265625, "learning_rate": 2.2968560987319845e-06, "loss": 1.3863, "step": 13650 }, { "epoch": 0.683, "grad_norm": 2.109375, "learning_rate": 2.2902389581788736e-06, "loss": 1.3677, "step": 13660 }, { "epoch": 0.6835, "grad_norm": 2.109375, "learning_rate": 2.2836285305164703e-06, "loss": 1.3819, "step": 13670 }, { "epoch": 0.684, "grad_norm": 1.96875, "learning_rate": 2.277024832120787e-06, "loss": 1.3782, "step": 13680 }, { "epoch": 0.6845, "grad_norm": 2.0625, "learning_rate": 2.270427879351164e-06, "loss": 1.4052, "step": 13690 }, { "epoch": 0.685, "grad_norm": 1.859375, "learning_rate": 2.2638376885502294e-06, "loss": 1.3748, "step": 13700 }, { "epoch": 0.6855, "grad_norm": 2.0, "learning_rate": 2.2572542760438654e-06, "loss": 1.3845, "step": 13710 }, { "epoch": 0.686, "grad_norm": 2.0625, "learning_rate": 2.2506776581411584e-06, "loss": 1.3972, "step": 13720 }, { "epoch": 0.6865, "grad_norm": 2.21875, "learning_rate": 2.2441078511343594e-06, "loss": 1.3641, "step": 13730 }, { "epoch": 0.687, "grad_norm": 1.859375, "learning_rate": 2.237544871298856e-06, "loss": 1.3694, "step": 13740 }, { "epoch": 0.6875, "grad_norm": 1.8984375, "learning_rate": 2.230988734893113e-06, "loss": 1.391, "step": 13750 }, { "epoch": 0.688, "grad_norm": 1.984375, "learning_rate": 2.2244394581586497e-06, "loss": 1.3748, "step": 13760 }, { "epoch": 0.6885, "grad_norm": 2.125, "learning_rate": 2.2178970573199877e-06, "loss": 1.382, "step": 13770 }, { "epoch": 0.689, "grad_norm": 1.8203125, "learning_rate": 2.2113615485846124e-06, "loss": 1.4016, "step": 13780 }, { "epoch": 0.6895, "grad_norm": 2.03125, "learning_rate": 2.204832948142942e-06, "loss": 1.3785, "step": 13790 }, { "epoch": 0.69, "grad_norm": 1.8046875, "learning_rate": 2.198311272168281e-06, "loss": 1.3771, "step": 13800 }, { "epoch": 0.6905, "grad_norm": 1.875, "learning_rate": 2.191796536816769e-06, "loss": 1.3668, "step": 13810 }, { "epoch": 0.691, "grad_norm": 1.984375, "learning_rate": 2.1852887582273624e-06, "loss": 1.3667, "step": 13820 }, { "epoch": 0.6915, "grad_norm": 2.046875, "learning_rate": 2.1787879525217826e-06, "loss": 1.3735, "step": 13830 }, { "epoch": 0.692, "grad_norm": 1.9140625, "learning_rate": 2.172294135804473e-06, "loss": 1.3891, "step": 13840 }, { "epoch": 0.6925, "grad_norm": 2.328125, "learning_rate": 2.1658073241625634e-06, "loss": 1.3834, "step": 13850 }, { "epoch": 0.693, "grad_norm": 2.046875, "learning_rate": 2.159327533665835e-06, "loss": 1.3777, "step": 13860 }, { "epoch": 0.6935, "grad_norm": 1.9296875, "learning_rate": 2.152854780366668e-06, "loss": 1.3815, "step": 13870 }, { "epoch": 0.694, "grad_norm": 1.921875, "learning_rate": 2.1463890803000166e-06, "loss": 1.3701, "step": 13880 }, { "epoch": 0.6945, "grad_norm": 1.875, "learning_rate": 2.1399304494833584e-06, "loss": 1.3786, "step": 13890 }, { "epoch": 0.695, "grad_norm": 1.9140625, "learning_rate": 2.1334789039166555e-06, "loss": 1.3847, "step": 13900 }, { "epoch": 0.6955, "grad_norm": 2.140625, "learning_rate": 2.127034459582324e-06, "loss": 1.3885, "step": 13910 }, { "epoch": 0.696, "grad_norm": 1.7578125, "learning_rate": 2.120597132445188e-06, "loss": 1.3674, "step": 13920 }, { "epoch": 0.6965, "grad_norm": 1.7734375, "learning_rate": 2.11416693845243e-06, "loss": 1.3917, "step": 13930 }, { "epoch": 0.697, "grad_norm": 2.140625, "learning_rate": 2.107743893533573e-06, "loss": 1.3922, "step": 13940 }, { "epoch": 0.6975, "grad_norm": 1.921875, "learning_rate": 2.1013280136004265e-06, "loss": 1.3782, "step": 13950 }, { "epoch": 0.698, "grad_norm": 1.8515625, "learning_rate": 2.0949193145470474e-06, "loss": 1.3836, "step": 13960 }, { "epoch": 0.6985, "grad_norm": 1.9375, "learning_rate": 2.0885178122497043e-06, "loss": 1.3811, "step": 13970 }, { "epoch": 0.699, "grad_norm": 1.8828125, "learning_rate": 2.0821235225668403e-06, "loss": 1.3789, "step": 13980 }, { "epoch": 0.6995, "grad_norm": 1.984375, "learning_rate": 2.0757364613390274e-06, "loss": 1.3816, "step": 13990 }, { "epoch": 0.7, "grad_norm": 2.0625, "learning_rate": 2.0693566443889295e-06, "loss": 1.3795, "step": 14000 }, { "epoch": 0.7005, "grad_norm": 2.15625, "learning_rate": 2.0629840875212704e-06, "loss": 1.3788, "step": 14010 }, { "epoch": 0.701, "grad_norm": 2.03125, "learning_rate": 2.0566188065227804e-06, "loss": 1.3906, "step": 14020 }, { "epoch": 0.7015, "grad_norm": 1.9609375, "learning_rate": 2.0502608171621735e-06, "loss": 1.373, "step": 14030 }, { "epoch": 0.702, "grad_norm": 1.90625, "learning_rate": 2.0439101351900937e-06, "loss": 1.3772, "step": 14040 }, { "epoch": 0.7025, "grad_norm": 1.859375, "learning_rate": 2.0375667763390833e-06, "loss": 1.3845, "step": 14050 }, { "epoch": 0.703, "grad_norm": 1.7265625, "learning_rate": 2.0312307563235457e-06, "loss": 1.3994, "step": 14060 }, { "epoch": 0.7035, "grad_norm": 1.9609375, "learning_rate": 2.0249020908397066e-06, "loss": 1.3848, "step": 14070 }, { "epoch": 0.704, "grad_norm": 1.8515625, "learning_rate": 2.0185807955655604e-06, "loss": 1.3777, "step": 14080 }, { "epoch": 0.7045, "grad_norm": 2.109375, "learning_rate": 2.0122668861608564e-06, "loss": 1.3851, "step": 14090 }, { "epoch": 0.705, "grad_norm": 1.8046875, "learning_rate": 2.005960378267041e-06, "loss": 1.389, "step": 14100 }, { "epoch": 0.7055, "grad_norm": 1.890625, "learning_rate": 1.9996612875072257e-06, "loss": 1.3687, "step": 14110 }, { "epoch": 0.706, "grad_norm": 2.015625, "learning_rate": 1.9933696294861445e-06, "loss": 1.3963, "step": 14120 }, { "epoch": 0.7065, "grad_norm": 2.109375, "learning_rate": 1.9870854197901248e-06, "loss": 1.3735, "step": 14130 }, { "epoch": 0.707, "grad_norm": 2.0625, "learning_rate": 1.9808086739870345e-06, "loss": 1.3974, "step": 14140 }, { "epoch": 0.7075, "grad_norm": 1.8984375, "learning_rate": 1.9745394076262604e-06, "loss": 1.3922, "step": 14150 }, { "epoch": 0.708, "grad_norm": 1.9296875, "learning_rate": 1.968277636238651e-06, "loss": 1.3833, "step": 14160 }, { "epoch": 0.7085, "grad_norm": 2.03125, "learning_rate": 1.9620233753364925e-06, "loss": 1.3943, "step": 14170 }, { "epoch": 0.709, "grad_norm": 2.046875, "learning_rate": 1.9557766404134655e-06, "loss": 1.383, "step": 14180 }, { "epoch": 0.7095, "grad_norm": 1.828125, "learning_rate": 1.949537446944609e-06, "loss": 1.3894, "step": 14190 }, { "epoch": 0.71, "grad_norm": 1.9453125, "learning_rate": 1.943305810386269e-06, "loss": 1.3493, "step": 14200 }, { "epoch": 0.7105, "grad_norm": 1.828125, "learning_rate": 1.937081746176082e-06, "loss": 1.3692, "step": 14210 }, { "epoch": 0.711, "grad_norm": 2.0, "learning_rate": 1.9308652697329233e-06, "loss": 1.4124, "step": 14220 }, { "epoch": 0.7115, "grad_norm": 2.1875, "learning_rate": 1.924656396456867e-06, "loss": 1.3838, "step": 14230 }, { "epoch": 0.712, "grad_norm": 2.546875, "learning_rate": 1.9184551417291532e-06, "loss": 1.3916, "step": 14240 }, { "epoch": 0.7125, "grad_norm": 2.1875, "learning_rate": 1.9122615209121525e-06, "loss": 1.3827, "step": 14250 }, { "epoch": 0.713, "grad_norm": 2.015625, "learning_rate": 1.9060755493493172e-06, "loss": 1.3819, "step": 14260 }, { "epoch": 0.7135, "grad_norm": 2.0, "learning_rate": 1.899897242365158e-06, "loss": 1.3897, "step": 14270 }, { "epoch": 0.714, "grad_norm": 1.71875, "learning_rate": 1.8937266152651917e-06, "loss": 1.3838, "step": 14280 }, { "epoch": 0.7145, "grad_norm": 1.828125, "learning_rate": 1.8875636833359113e-06, "loss": 1.368, "step": 14290 }, { "epoch": 0.715, "grad_norm": 1.9921875, "learning_rate": 1.881408461844748e-06, "loss": 1.4064, "step": 14300 }, { "epoch": 0.7155, "grad_norm": 1.7890625, "learning_rate": 1.8752609660400356e-06, "loss": 1.3921, "step": 14310 }, { "epoch": 0.716, "grad_norm": 1.8984375, "learning_rate": 1.8691212111509571e-06, "loss": 1.386, "step": 14320 }, { "epoch": 0.7165, "grad_norm": 1.9140625, "learning_rate": 1.8629892123875303e-06, "loss": 1.3775, "step": 14330 }, { "epoch": 0.717, "grad_norm": 2.046875, "learning_rate": 1.8568649849405572e-06, "loss": 1.3894, "step": 14340 }, { "epoch": 0.7175, "grad_norm": 1.890625, "learning_rate": 1.8507485439815836e-06, "loss": 1.3849, "step": 14350 }, { "epoch": 0.718, "grad_norm": 2.0625, "learning_rate": 1.8446399046628655e-06, "loss": 1.3902, "step": 14360 }, { "epoch": 0.7185, "grad_norm": 1.8515625, "learning_rate": 1.8385390821173394e-06, "loss": 1.374, "step": 14370 }, { "epoch": 0.719, "grad_norm": 2.296875, "learning_rate": 1.8324460914585674e-06, "loss": 1.3901, "step": 14380 }, { "epoch": 0.7195, "grad_norm": 1.953125, "learning_rate": 1.8263609477807186e-06, "loss": 1.3677, "step": 14390 }, { "epoch": 0.72, "grad_norm": 1.875, "learning_rate": 1.8202836661585165e-06, "loss": 1.4028, "step": 14400 }, { "epoch": 0.7205, "grad_norm": 2.046875, "learning_rate": 1.8142142616472097e-06, "loss": 1.3695, "step": 14410 }, { "epoch": 0.721, "grad_norm": 2.578125, "learning_rate": 1.8081527492825335e-06, "loss": 1.3805, "step": 14420 }, { "epoch": 0.7215, "grad_norm": 1.8828125, "learning_rate": 1.8020991440806762e-06, "loss": 1.4048, "step": 14430 }, { "epoch": 0.722, "grad_norm": 1.828125, "learning_rate": 1.7960534610382259e-06, "loss": 1.3867, "step": 14440 }, { "epoch": 0.7225, "grad_norm": 2.328125, "learning_rate": 1.7900157151321557e-06, "loss": 1.3941, "step": 14450 }, { "epoch": 0.723, "grad_norm": 1.875, "learning_rate": 1.7839859213197753e-06, "loss": 1.3843, "step": 14460 }, { "epoch": 0.7235, "grad_norm": 2.015625, "learning_rate": 1.7779640945386894e-06, "loss": 1.4136, "step": 14470 }, { "epoch": 0.724, "grad_norm": 2.0, "learning_rate": 1.771950249706768e-06, "loss": 1.3747, "step": 14480 }, { "epoch": 0.7245, "grad_norm": 2.03125, "learning_rate": 1.7659444017221116e-06, "loss": 1.3743, "step": 14490 }, { "epoch": 0.725, "grad_norm": 2.09375, "learning_rate": 1.7599465654630027e-06, "loss": 1.3745, "step": 14500 }, { "epoch": 0.7255, "grad_norm": 1.8671875, "learning_rate": 1.753956755787884e-06, "loss": 1.3951, "step": 14510 }, { "epoch": 0.726, "grad_norm": 1.8515625, "learning_rate": 1.7479749875353086e-06, "loss": 1.362, "step": 14520 }, { "epoch": 0.7265, "grad_norm": 1.9296875, "learning_rate": 1.742001275523909e-06, "loss": 1.4095, "step": 14530 }, { "epoch": 0.727, "grad_norm": 1.8984375, "learning_rate": 1.7360356345523638e-06, "loss": 1.398, "step": 14540 }, { "epoch": 0.7275, "grad_norm": 2.109375, "learning_rate": 1.7300780793993538e-06, "loss": 1.3791, "step": 14550 }, { "epoch": 0.728, "grad_norm": 1.921875, "learning_rate": 1.7241286248235274e-06, "loss": 1.3828, "step": 14560 }, { "epoch": 0.7285, "grad_norm": 1.8515625, "learning_rate": 1.7181872855634697e-06, "loss": 1.3893, "step": 14570 }, { "epoch": 0.729, "grad_norm": 1.9140625, "learning_rate": 1.7122540763376633e-06, "loss": 1.3792, "step": 14580 }, { "epoch": 0.7295, "grad_norm": 2.09375, "learning_rate": 1.7063290118444393e-06, "loss": 1.4058, "step": 14590 }, { "epoch": 0.73, "grad_norm": 1.9375, "learning_rate": 1.7004121067619632e-06, "loss": 1.374, "step": 14600 }, { "epoch": 0.7305, "grad_norm": 2.0625, "learning_rate": 1.694503375748185e-06, "loss": 1.3807, "step": 14610 }, { "epoch": 0.731, "grad_norm": 2.03125, "learning_rate": 1.688602833440801e-06, "loss": 1.3731, "step": 14620 }, { "epoch": 0.7315, "grad_norm": 1.96875, "learning_rate": 1.6827104944572226e-06, "loss": 1.4004, "step": 14630 }, { "epoch": 0.732, "grad_norm": 1.9140625, "learning_rate": 1.6768263733945444e-06, "loss": 1.3824, "step": 14640 }, { "epoch": 0.7325, "grad_norm": 1.8984375, "learning_rate": 1.670950484829495e-06, "loss": 1.3866, "step": 14650 }, { "epoch": 0.733, "grad_norm": 1.8515625, "learning_rate": 1.6650828433184156e-06, "loss": 1.3596, "step": 14660 }, { "epoch": 0.7335, "grad_norm": 1.984375, "learning_rate": 1.6592234633972127e-06, "loss": 1.376, "step": 14670 }, { "epoch": 0.734, "grad_norm": 2.15625, "learning_rate": 1.653372359581325e-06, "loss": 1.393, "step": 14680 }, { "epoch": 0.7345, "grad_norm": 2.015625, "learning_rate": 1.6475295463656937e-06, "loss": 1.3836, "step": 14690 }, { "epoch": 0.735, "grad_norm": 1.8671875, "learning_rate": 1.6416950382247222e-06, "loss": 1.3881, "step": 14700 }, { "epoch": 0.7355, "grad_norm": 1.921875, "learning_rate": 1.63586884961223e-06, "loss": 1.3817, "step": 14710 }, { "epoch": 0.736, "grad_norm": 1.96875, "learning_rate": 1.6300509949614374e-06, "loss": 1.3759, "step": 14720 }, { "epoch": 0.7365, "grad_norm": 1.859375, "learning_rate": 1.6242414886849161e-06, "loss": 1.3829, "step": 14730 }, { "epoch": 0.737, "grad_norm": 1.8828125, "learning_rate": 1.6184403451745546e-06, "loss": 1.3739, "step": 14740 }, { "epoch": 0.7375, "grad_norm": 2.015625, "learning_rate": 1.6126475788015223e-06, "loss": 1.3762, "step": 14750 }, { "epoch": 0.738, "grad_norm": 2.015625, "learning_rate": 1.606863203916242e-06, "loss": 1.3839, "step": 14760 }, { "epoch": 0.7385, "grad_norm": 1.859375, "learning_rate": 1.6010872348483425e-06, "loss": 1.3862, "step": 14770 }, { "epoch": 0.739, "grad_norm": 1.9609375, "learning_rate": 1.5953196859066333e-06, "loss": 1.3962, "step": 14780 }, { "epoch": 0.7395, "grad_norm": 1.890625, "learning_rate": 1.5895605713790623e-06, "loss": 1.3838, "step": 14790 }, { "epoch": 0.74, "grad_norm": 1.9609375, "learning_rate": 1.583809905532681e-06, "loss": 1.3682, "step": 14800 }, { "epoch": 0.7405, "grad_norm": 1.9609375, "learning_rate": 1.5780677026136154e-06, "loss": 1.4007, "step": 14810 }, { "epoch": 0.741, "grad_norm": 1.7421875, "learning_rate": 1.5723339768470264e-06, "loss": 1.3762, "step": 14820 }, { "epoch": 0.7415, "grad_norm": 2.03125, "learning_rate": 1.5666087424370668e-06, "loss": 1.3819, "step": 14830 }, { "epoch": 0.742, "grad_norm": 1.796875, "learning_rate": 1.560892013566861e-06, "loss": 1.3756, "step": 14840 }, { "epoch": 0.7425, "grad_norm": 2.0, "learning_rate": 1.5551838043984634e-06, "loss": 1.3715, "step": 14850 }, { "epoch": 0.743, "grad_norm": 1.9453125, "learning_rate": 1.549484129072818e-06, "loss": 1.3807, "step": 14860 }, { "epoch": 0.7435, "grad_norm": 1.890625, "learning_rate": 1.5437930017097286e-06, "loss": 1.3898, "step": 14870 }, { "epoch": 0.744, "grad_norm": 1.921875, "learning_rate": 1.5381104364078263e-06, "loss": 1.4003, "step": 14880 }, { "epoch": 0.7445, "grad_norm": 2.0625, "learning_rate": 1.532436447244528e-06, "loss": 1.3664, "step": 14890 }, { "epoch": 0.745, "grad_norm": 1.8203125, "learning_rate": 1.5267710482760084e-06, "loss": 1.3851, "step": 14900 }, { "epoch": 0.7455, "grad_norm": 1.96875, "learning_rate": 1.521114253537158e-06, "loss": 1.3782, "step": 14910 }, { "epoch": 0.746, "grad_norm": 1.9375, "learning_rate": 1.5154660770415531e-06, "loss": 1.3783, "step": 14920 }, { "epoch": 0.7465, "grad_norm": 2.078125, "learning_rate": 1.509826532781421e-06, "loss": 1.3831, "step": 14930 }, { "epoch": 0.747, "grad_norm": 1.921875, "learning_rate": 1.5041956347276083e-06, "loss": 1.3822, "step": 14940 }, { "epoch": 0.7475, "grad_norm": 1.7890625, "learning_rate": 1.4985733968295313e-06, "loss": 1.395, "step": 14950 }, { "epoch": 0.748, "grad_norm": 1.8671875, "learning_rate": 1.4929598330151617e-06, "loss": 1.3816, "step": 14960 }, { "epoch": 0.7485, "grad_norm": 1.8671875, "learning_rate": 1.487354957190984e-06, "loss": 1.3886, "step": 14970 }, { "epoch": 0.749, "grad_norm": 1.8984375, "learning_rate": 1.4817587832419527e-06, "loss": 1.3961, "step": 14980 }, { "epoch": 0.7495, "grad_norm": 1.8671875, "learning_rate": 1.4761713250314685e-06, "loss": 1.3988, "step": 14990 }, { "epoch": 0.75, "grad_norm": 2.15625, "learning_rate": 1.470592596401344e-06, "loss": 1.3921, "step": 15000 }, { "epoch": 0.7505, "grad_norm": 1.671875, "learning_rate": 1.4650226111717598e-06, "loss": 1.3606, "step": 15010 }, { "epoch": 0.751, "grad_norm": 2.046875, "learning_rate": 1.4594613831412419e-06, "loss": 1.3794, "step": 15020 }, { "epoch": 0.7515, "grad_norm": 1.8515625, "learning_rate": 1.4539089260866196e-06, "loss": 1.3761, "step": 15030 }, { "epoch": 0.752, "grad_norm": 2.328125, "learning_rate": 1.4483652537629923e-06, "loss": 1.3793, "step": 15040 }, { "epoch": 0.7525, "grad_norm": 1.859375, "learning_rate": 1.4428303799037009e-06, "loss": 1.3787, "step": 15050 }, { "epoch": 0.753, "grad_norm": 2.046875, "learning_rate": 1.4373043182202868e-06, "loss": 1.3867, "step": 15060 }, { "epoch": 0.7535, "grad_norm": 2.125, "learning_rate": 1.431787082402461e-06, "loss": 1.3872, "step": 15070 }, { "epoch": 0.754, "grad_norm": 1.9921875, "learning_rate": 1.4262786861180717e-06, "loss": 1.3928, "step": 15080 }, { "epoch": 0.7545, "grad_norm": 1.9765625, "learning_rate": 1.420779143013072e-06, "loss": 1.3875, "step": 15090 }, { "epoch": 0.755, "grad_norm": 2.125, "learning_rate": 1.415288466711473e-06, "loss": 1.3847, "step": 15100 }, { "epoch": 0.7555, "grad_norm": 2.078125, "learning_rate": 1.4098066708153297e-06, "loss": 1.4014, "step": 15110 }, { "epoch": 0.756, "grad_norm": 1.9140625, "learning_rate": 1.404333768904696e-06, "loss": 1.379, "step": 15120 }, { "epoch": 0.7565, "grad_norm": 2.078125, "learning_rate": 1.3988697745375895e-06, "loss": 1.3915, "step": 15130 }, { "epoch": 0.757, "grad_norm": 1.890625, "learning_rate": 1.3934147012499617e-06, "loss": 1.375, "step": 15140 }, { "epoch": 0.7575, "grad_norm": 1.8515625, "learning_rate": 1.3879685625556671e-06, "loss": 1.3808, "step": 15150 }, { "epoch": 0.758, "grad_norm": 1.9296875, "learning_rate": 1.3825313719464218e-06, "loss": 1.4046, "step": 15160 }, { "epoch": 0.7585, "grad_norm": 2.09375, "learning_rate": 1.3771031428917807e-06, "loss": 1.3834, "step": 15170 }, { "epoch": 0.759, "grad_norm": 2.015625, "learning_rate": 1.3716838888390938e-06, "loss": 1.3641, "step": 15180 }, { "epoch": 0.7595, "grad_norm": 1.8828125, "learning_rate": 1.3662736232134765e-06, "loss": 1.3789, "step": 15190 }, { "epoch": 0.76, "grad_norm": 1.71875, "learning_rate": 1.3608723594177803e-06, "loss": 1.3773, "step": 15200 }, { "epoch": 0.7605, "grad_norm": 1.890625, "learning_rate": 1.3554801108325593e-06, "loss": 1.3765, "step": 15210 }, { "epoch": 0.761, "grad_norm": 2.03125, "learning_rate": 1.3500968908160239e-06, "loss": 1.3794, "step": 15220 }, { "epoch": 0.7615, "grad_norm": 1.8828125, "learning_rate": 1.3447227127040273e-06, "loss": 1.3763, "step": 15230 }, { "epoch": 0.762, "grad_norm": 1.875, "learning_rate": 1.3393575898100219e-06, "loss": 1.3749, "step": 15240 }, { "epoch": 0.7625, "grad_norm": 1.84375, "learning_rate": 1.334001535425024e-06, "loss": 1.3721, "step": 15250 }, { "epoch": 0.763, "grad_norm": 1.8359375, "learning_rate": 1.3286545628175863e-06, "loss": 1.384, "step": 15260 }, { "epoch": 0.7635, "grad_norm": 2.109375, "learning_rate": 1.3233166852337647e-06, "loss": 1.3893, "step": 15270 }, { "epoch": 0.764, "grad_norm": 1.9765625, "learning_rate": 1.3179879158970816e-06, "loss": 1.3843, "step": 15280 }, { "epoch": 0.7645, "grad_norm": 2.234375, "learning_rate": 1.3126682680084979e-06, "loss": 1.3863, "step": 15290 }, { "epoch": 0.765, "grad_norm": 2.0, "learning_rate": 1.3073577547463756e-06, "loss": 1.3875, "step": 15300 }, { "epoch": 0.7655, "grad_norm": 1.984375, "learning_rate": 1.3020563892664478e-06, "loss": 1.3688, "step": 15310 }, { "epoch": 0.766, "grad_norm": 2.140625, "learning_rate": 1.2967641847017891e-06, "loss": 1.3782, "step": 15320 }, { "epoch": 0.7665, "grad_norm": 2.0625, "learning_rate": 1.2914811541627759e-06, "loss": 1.3775, "step": 15330 }, { "epoch": 0.767, "grad_norm": 1.921875, "learning_rate": 1.286207310737057e-06, "loss": 1.3899, "step": 15340 }, { "epoch": 0.7675, "grad_norm": 1.8828125, "learning_rate": 1.280942667489527e-06, "loss": 1.3739, "step": 15350 }, { "epoch": 0.768, "grad_norm": 1.953125, "learning_rate": 1.275687237462286e-06, "loss": 1.3878, "step": 15360 }, { "epoch": 0.7685, "grad_norm": 1.984375, "learning_rate": 1.2704410336746093e-06, "loss": 1.3907, "step": 15370 }, { "epoch": 0.769, "grad_norm": 2.015625, "learning_rate": 1.265204069122915e-06, "loss": 1.4001, "step": 15380 }, { "epoch": 0.7695, "grad_norm": 2.171875, "learning_rate": 1.2599763567807389e-06, "loss": 1.3626, "step": 15390 }, { "epoch": 0.77, "grad_norm": 2.296875, "learning_rate": 1.254757909598688e-06, "loss": 1.3668, "step": 15400 }, { "epoch": 0.7705, "grad_norm": 1.9765625, "learning_rate": 1.2495487405044243e-06, "loss": 1.3781, "step": 15410 }, { "epoch": 0.771, "grad_norm": 1.9375, "learning_rate": 1.2443488624026207e-06, "loss": 1.3845, "step": 15420 }, { "epoch": 0.7715, "grad_norm": 1.9375, "learning_rate": 1.239158288174932e-06, "loss": 1.3752, "step": 15430 }, { "epoch": 0.772, "grad_norm": 1.984375, "learning_rate": 1.2339770306799714e-06, "loss": 1.385, "step": 15440 }, { "epoch": 0.7725, "grad_norm": 2.09375, "learning_rate": 1.228805102753265e-06, "loss": 1.387, "step": 15450 }, { "epoch": 0.773, "grad_norm": 1.8359375, "learning_rate": 1.2236425172072285e-06, "loss": 1.3887, "step": 15460 }, { "epoch": 0.7735, "grad_norm": 1.8359375, "learning_rate": 1.2184892868311366e-06, "loss": 1.3785, "step": 15470 }, { "epoch": 0.774, "grad_norm": 1.859375, "learning_rate": 1.2133454243910841e-06, "loss": 1.3746, "step": 15480 }, { "epoch": 0.7745, "grad_norm": 2.0, "learning_rate": 1.2082109426299632e-06, "loss": 1.3887, "step": 15490 }, { "epoch": 0.775, "grad_norm": 1.9296875, "learning_rate": 1.2030858542674228e-06, "loss": 1.3804, "step": 15500 }, { "epoch": 0.7755, "grad_norm": 2.8125, "learning_rate": 1.1979701719998454e-06, "loss": 1.4019, "step": 15510 }, { "epoch": 0.776, "grad_norm": 1.8828125, "learning_rate": 1.1928639085003085e-06, "loss": 1.3741, "step": 15520 }, { "epoch": 0.7765, "grad_norm": 1.796875, "learning_rate": 1.18776707641856e-06, "loss": 1.385, "step": 15530 }, { "epoch": 0.777, "grad_norm": 1.953125, "learning_rate": 1.1826796883809805e-06, "loss": 1.3821, "step": 15540 }, { "epoch": 0.7775, "grad_norm": 1.90625, "learning_rate": 1.177601756990554e-06, "loss": 1.3833, "step": 15550 }, { "epoch": 0.778, "grad_norm": 2.5625, "learning_rate": 1.1725332948268425e-06, "loss": 1.3686, "step": 15560 }, { "epoch": 0.7785, "grad_norm": 2.0, "learning_rate": 1.1674743144459445e-06, "loss": 1.3859, "step": 15570 }, { "epoch": 0.779, "grad_norm": 2.0, "learning_rate": 1.1624248283804713e-06, "loss": 1.4034, "step": 15580 }, { "epoch": 0.7795, "grad_norm": 1.921875, "learning_rate": 1.1573848491395162e-06, "loss": 1.3827, "step": 15590 }, { "epoch": 0.78, "grad_norm": 1.984375, "learning_rate": 1.1523543892086164e-06, "loss": 1.3597, "step": 15600 }, { "epoch": 0.7805, "grad_norm": 2.125, "learning_rate": 1.1473334610497329e-06, "loss": 1.3801, "step": 15610 }, { "epoch": 0.781, "grad_norm": 1.796875, "learning_rate": 1.1423220771012067e-06, "loss": 1.4079, "step": 15620 }, { "epoch": 0.7815, "grad_norm": 1.8671875, "learning_rate": 1.1373202497777424e-06, "loss": 1.3667, "step": 15630 }, { "epoch": 0.782, "grad_norm": 1.8984375, "learning_rate": 1.132327991470365e-06, "loss": 1.3696, "step": 15640 }, { "epoch": 0.7825, "grad_norm": 2.03125, "learning_rate": 1.127345314546393e-06, "loss": 1.4002, "step": 15650 }, { "epoch": 0.783, "grad_norm": 2.25, "learning_rate": 1.122372231349415e-06, "loss": 1.3778, "step": 15660 }, { "epoch": 0.7835, "grad_norm": 1.9453125, "learning_rate": 1.1174087541992462e-06, "loss": 1.3764, "step": 15670 }, { "epoch": 0.784, "grad_norm": 2.140625, "learning_rate": 1.1124548953919106e-06, "loss": 1.3872, "step": 15680 }, { "epoch": 0.7845, "grad_norm": 2.171875, "learning_rate": 1.107510667199601e-06, "loss": 1.3837, "step": 15690 }, { "epoch": 0.785, "grad_norm": 1.96875, "learning_rate": 1.102576081870652e-06, "loss": 1.3735, "step": 15700 }, { "epoch": 0.7855, "grad_norm": 2.09375, "learning_rate": 1.097651151629514e-06, "loss": 1.3757, "step": 15710 }, { "epoch": 0.786, "grad_norm": 1.7734375, "learning_rate": 1.0927358886767142e-06, "loss": 1.3861, "step": 15720 }, { "epoch": 0.7865, "grad_norm": 1.9140625, "learning_rate": 1.0878303051888324e-06, "loss": 1.389, "step": 15730 }, { "epoch": 0.787, "grad_norm": 2.0, "learning_rate": 1.0829344133184715e-06, "loss": 1.399, "step": 15740 }, { "epoch": 0.7875, "grad_norm": 2.015625, "learning_rate": 1.0780482251942226e-06, "loss": 1.3846, "step": 15750 }, { "epoch": 0.788, "grad_norm": 1.8515625, "learning_rate": 1.0731717529206403e-06, "loss": 1.3789, "step": 15760 }, { "epoch": 0.7885, "grad_norm": 1.890625, "learning_rate": 1.068305008578207e-06, "loss": 1.371, "step": 15770 }, { "epoch": 0.789, "grad_norm": 2.03125, "learning_rate": 1.0634480042233102e-06, "loss": 1.3762, "step": 15780 }, { "epoch": 0.7895, "grad_norm": 1.890625, "learning_rate": 1.058600751888204e-06, "loss": 1.3989, "step": 15790 }, { "epoch": 0.79, "grad_norm": 1.8125, "learning_rate": 1.053763263580988e-06, "loss": 1.3853, "step": 15800 }, { "epoch": 0.7905, "grad_norm": 2.0625, "learning_rate": 1.048935551285571e-06, "loss": 1.3879, "step": 15810 }, { "epoch": 0.791, "grad_norm": 2.1875, "learning_rate": 1.0441176269616415e-06, "loss": 1.3745, "step": 15820 }, { "epoch": 0.7915, "grad_norm": 1.90625, "learning_rate": 1.0393095025446465e-06, "loss": 1.3786, "step": 15830 }, { "epoch": 0.792, "grad_norm": 2.046875, "learning_rate": 1.03451118994575e-06, "loss": 1.4016, "step": 15840 }, { "epoch": 0.7925, "grad_norm": 2.203125, "learning_rate": 1.0297227010518106e-06, "loss": 1.3666, "step": 15850 }, { "epoch": 0.793, "grad_norm": 1.890625, "learning_rate": 1.0249440477253536e-06, "loss": 1.3931, "step": 15860 }, { "epoch": 0.7935, "grad_norm": 1.90625, "learning_rate": 1.0201752418045336e-06, "loss": 1.3687, "step": 15870 }, { "epoch": 0.794, "grad_norm": 1.9296875, "learning_rate": 1.015416295103116e-06, "loss": 1.3941, "step": 15880 }, { "epoch": 0.7945, "grad_norm": 2.0625, "learning_rate": 1.0106672194104356e-06, "loss": 1.3739, "step": 15890 }, { "epoch": 0.795, "grad_norm": 1.9375, "learning_rate": 1.0059280264913807e-06, "loss": 1.3761, "step": 15900 }, { "epoch": 0.7955, "grad_norm": 1.8984375, "learning_rate": 1.0011987280863494e-06, "loss": 1.3782, "step": 15910 }, { "epoch": 0.796, "grad_norm": 1.8125, "learning_rate": 9.96479335911235e-07, "loss": 1.3767, "step": 15920 }, { "epoch": 0.7965, "grad_norm": 2.09375, "learning_rate": 9.917698616573874e-07, "loss": 1.4025, "step": 15930 }, { "epoch": 0.797, "grad_norm": 2.09375, "learning_rate": 9.870703169915829e-07, "loss": 1.3781, "step": 15940 }, { "epoch": 0.7975, "grad_norm": 1.8515625, "learning_rate": 9.823807135560076e-07, "loss": 1.375, "step": 15950 }, { "epoch": 0.798, "grad_norm": 2.0625, "learning_rate": 9.777010629682133e-07, "loss": 1.4006, "step": 15960 }, { "epoch": 0.7985, "grad_norm": 1.9765625, "learning_rate": 9.730313768210963e-07, "loss": 1.3914, "step": 15970 }, { "epoch": 0.799, "grad_norm": 1.9609375, "learning_rate": 9.68371666682873e-07, "loss": 1.3896, "step": 15980 }, { "epoch": 0.7995, "grad_norm": 1.921875, "learning_rate": 9.63721944097039e-07, "loss": 1.3676, "step": 15990 }, { "epoch": 0.8, "grad_norm": 2.234375, "learning_rate": 9.590822205823552e-07, "loss": 1.4104, "step": 16000 }, { "epoch": 0.8005, "grad_norm": 1.9921875, "learning_rate": 9.544525076328066e-07, "loss": 1.3867, "step": 16010 }, { "epoch": 0.801, "grad_norm": 2.078125, "learning_rate": 9.498328167175785e-07, "loss": 1.4119, "step": 16020 }, { "epoch": 0.8015, "grad_norm": 2.0, "learning_rate": 9.452231592810329e-07, "loss": 1.3828, "step": 16030 }, { "epoch": 0.802, "grad_norm": 1.859375, "learning_rate": 9.406235467426744e-07, "loss": 1.3742, "step": 16040 }, { "epoch": 0.8025, "grad_norm": 1.8671875, "learning_rate": 9.360339904971211e-07, "loss": 1.3768, "step": 16050 }, { "epoch": 0.803, "grad_norm": 2.234375, "learning_rate": 9.314545019140791e-07, "loss": 1.3865, "step": 16060 }, { "epoch": 0.8035, "grad_norm": 1.8671875, "learning_rate": 9.268850923383183e-07, "loss": 1.3729, "step": 16070 }, { "epoch": 0.804, "grad_norm": 1.96875, "learning_rate": 9.22325773089634e-07, "loss": 1.3932, "step": 16080 }, { "epoch": 0.8045, "grad_norm": 1.890625, "learning_rate": 9.17776555462826e-07, "loss": 1.4065, "step": 16090 }, { "epoch": 0.805, "grad_norm": 2.546875, "learning_rate": 9.132374507276743e-07, "loss": 1.3761, "step": 16100 }, { "epoch": 0.8055, "grad_norm": 1.8984375, "learning_rate": 9.087084701288984e-07, "loss": 1.3855, "step": 16110 }, { "epoch": 0.806, "grad_norm": 2.21875, "learning_rate": 9.041896248861453e-07, "loss": 1.3873, "step": 16120 }, { "epoch": 0.8065, "grad_norm": 1.890625, "learning_rate": 8.996809261939482e-07, "loss": 1.4039, "step": 16130 }, { "epoch": 0.807, "grad_norm": 1.8515625, "learning_rate": 8.95182385221704e-07, "loss": 1.3919, "step": 16140 }, { "epoch": 0.8075, "grad_norm": 2.09375, "learning_rate": 8.906940131136494e-07, "loss": 1.3776, "step": 16150 }, { "epoch": 0.808, "grad_norm": 2.0, "learning_rate": 8.862158209888294e-07, "loss": 1.3971, "step": 16160 }, { "epoch": 0.8085, "grad_norm": 1.7890625, "learning_rate": 8.817478199410673e-07, "loss": 1.3799, "step": 16170 }, { "epoch": 0.809, "grad_norm": 1.7890625, "learning_rate": 8.772900210389401e-07, "loss": 1.3834, "step": 16180 }, { "epoch": 0.8095, "grad_norm": 2.109375, "learning_rate": 8.728424353257559e-07, "loss": 1.3751, "step": 16190 }, { "epoch": 0.81, "grad_norm": 2.109375, "learning_rate": 8.684050738195166e-07, "loss": 1.3845, "step": 16200 }, { "epoch": 0.8105, "grad_norm": 1.8984375, "learning_rate": 8.639779475128951e-07, "loss": 1.3897, "step": 16210 }, { "epoch": 0.811, "grad_norm": 1.9453125, "learning_rate": 8.595610673732152e-07, "loss": 1.3796, "step": 16220 }, { "epoch": 0.8115, "grad_norm": 1.90625, "learning_rate": 8.551544443424103e-07, "loss": 1.3851, "step": 16230 }, { "epoch": 0.812, "grad_norm": 1.9921875, "learning_rate": 8.507580893370076e-07, "loss": 1.3842, "step": 16240 }, { "epoch": 0.8125, "grad_norm": 1.859375, "learning_rate": 8.463720132480985e-07, "loss": 1.3694, "step": 16250 }, { "epoch": 0.813, "grad_norm": 1.953125, "learning_rate": 8.419962269413062e-07, "loss": 1.376, "step": 16260 }, { "epoch": 0.8135, "grad_norm": 2.015625, "learning_rate": 8.376307412567686e-07, "loss": 1.3856, "step": 16270 }, { "epoch": 0.814, "grad_norm": 1.84375, "learning_rate": 8.332755670091009e-07, "loss": 1.3793, "step": 16280 }, { "epoch": 0.8145, "grad_norm": 2.09375, "learning_rate": 8.289307149873749e-07, "loss": 1.3722, "step": 16290 }, { "epoch": 0.815, "grad_norm": 1.796875, "learning_rate": 8.245961959550936e-07, "loss": 1.353, "step": 16300 }, { "epoch": 0.8155, "grad_norm": 1.9453125, "learning_rate": 8.202720206501614e-07, "loss": 1.3931, "step": 16310 }, { "epoch": 0.816, "grad_norm": 2.265625, "learning_rate": 8.159581997848553e-07, "loss": 1.3763, "step": 16320 }, { "epoch": 0.8165, "grad_norm": 2.0, "learning_rate": 8.116547440458034e-07, "loss": 1.3972, "step": 16330 }, { "epoch": 0.817, "grad_norm": 2.125, "learning_rate": 8.073616640939574e-07, "loss": 1.3768, "step": 16340 }, { "epoch": 0.8175, "grad_norm": 1.8828125, "learning_rate": 8.030789705645631e-07, "loss": 1.383, "step": 16350 }, { "epoch": 0.818, "grad_norm": 2.0625, "learning_rate": 7.988066740671351e-07, "loss": 1.3754, "step": 16360 }, { "epoch": 0.8185, "grad_norm": 2.03125, "learning_rate": 7.94544785185436e-07, "loss": 1.4001, "step": 16370 }, { "epoch": 0.819, "grad_norm": 1.921875, "learning_rate": 7.902933144774399e-07, "loss": 1.3745, "step": 16380 }, { "epoch": 0.8195, "grad_norm": 3.390625, "learning_rate": 7.860522724753167e-07, "loss": 1.3733, "step": 16390 }, { "epoch": 0.82, "grad_norm": 1.78125, "learning_rate": 7.818216696853986e-07, "loss": 1.3738, "step": 16400 }, { "epoch": 0.8205, "grad_norm": 2.1875, "learning_rate": 7.776015165881556e-07, "loss": 1.3898, "step": 16410 }, { "epoch": 0.821, "grad_norm": 1.9140625, "learning_rate": 7.733918236381744e-07, "loss": 1.3771, "step": 16420 }, { "epoch": 0.8215, "grad_norm": 1.71875, "learning_rate": 7.691926012641287e-07, "loss": 1.376, "step": 16430 }, { "epoch": 0.822, "grad_norm": 1.921875, "learning_rate": 7.650038598687471e-07, "loss": 1.3717, "step": 16440 }, { "epoch": 0.8225, "grad_norm": 1.8828125, "learning_rate": 7.608256098288003e-07, "loss": 1.3695, "step": 16450 }, { "epoch": 0.823, "grad_norm": 2.1875, "learning_rate": 7.566578614950681e-07, "loss": 1.3844, "step": 16460 }, { "epoch": 0.8235, "grad_norm": 1.9140625, "learning_rate": 7.525006251923117e-07, "loss": 1.3779, "step": 16470 }, { "epoch": 0.824, "grad_norm": 2.015625, "learning_rate": 7.483539112192506e-07, "loss": 1.3655, "step": 16480 }, { "epoch": 0.8245, "grad_norm": 2.21875, "learning_rate": 7.442177298485415e-07, "loss": 1.3913, "step": 16490 }, { "epoch": 0.825, "grad_norm": 2.09375, "learning_rate": 7.400920913267434e-07, "loss": 1.3907, "step": 16500 }, { "epoch": 0.8255, "grad_norm": 2.875, "learning_rate": 7.359770058743027e-07, "loss": 1.3722, "step": 16510 }, { "epoch": 0.826, "grad_norm": 1.78125, "learning_rate": 7.31872483685519e-07, "loss": 1.3762, "step": 16520 }, { "epoch": 0.8265, "grad_norm": 2.046875, "learning_rate": 7.277785349285232e-07, "loss": 1.3815, "step": 16530 }, { "epoch": 0.827, "grad_norm": 2.25, "learning_rate": 7.236951697452555e-07, "loss": 1.3716, "step": 16540 }, { "epoch": 0.8275, "grad_norm": 2.0625, "learning_rate": 7.196223982514383e-07, "loss": 1.3817, "step": 16550 }, { "epoch": 0.828, "grad_norm": 2.140625, "learning_rate": 7.155602305365433e-07, "loss": 1.3692, "step": 16560 }, { "epoch": 0.8285, "grad_norm": 1.8828125, "learning_rate": 7.115086766637796e-07, "loss": 1.3852, "step": 16570 }, { "epoch": 0.829, "grad_norm": 1.90625, "learning_rate": 7.074677466700636e-07, "loss": 1.373, "step": 16580 }, { "epoch": 0.8295, "grad_norm": 3.078125, "learning_rate": 7.034374505659891e-07, "loss": 1.3893, "step": 16590 }, { "epoch": 0.83, "grad_norm": 1.8125, "learning_rate": 6.994177983358063e-07, "loss": 1.366, "step": 16600 }, { "epoch": 0.8305, "grad_norm": 1.8203125, "learning_rate": 6.95408799937402e-07, "loss": 1.3954, "step": 16610 }, { "epoch": 0.831, "grad_norm": 2.0, "learning_rate": 6.914104653022657e-07, "loss": 1.3809, "step": 16620 }, { "epoch": 0.8315, "grad_norm": 2.0, "learning_rate": 6.874228043354725e-07, "loss": 1.3757, "step": 16630 }, { "epoch": 0.832, "grad_norm": 5.65625, "learning_rate": 6.834458269156547e-07, "loss": 1.3814, "step": 16640 }, { "epoch": 0.8325, "grad_norm": 2.015625, "learning_rate": 6.794795428949751e-07, "loss": 1.4012, "step": 16650 }, { "epoch": 0.833, "grad_norm": 2.1875, "learning_rate": 6.75523962099111e-07, "loss": 1.392, "step": 16660 }, { "epoch": 0.8335, "grad_norm": 2.21875, "learning_rate": 6.71579094327225e-07, "loss": 1.3836, "step": 16670 }, { "epoch": 0.834, "grad_norm": 2.0, "learning_rate": 6.67644949351931e-07, "loss": 1.3616, "step": 16680 }, { "epoch": 0.8345, "grad_norm": 1.828125, "learning_rate": 6.637215369192901e-07, "loss": 1.3737, "step": 16690 }, { "epoch": 0.835, "grad_norm": 1.9609375, "learning_rate": 6.598088667487717e-07, "loss": 1.3932, "step": 16700 }, { "epoch": 0.8355, "grad_norm": 1.9375, "learning_rate": 6.559069485332326e-07, "loss": 1.3835, "step": 16710 }, { "epoch": 0.836, "grad_norm": 2.09375, "learning_rate": 6.520157919388926e-07, "loss": 1.3707, "step": 16720 }, { "epoch": 0.8365, "grad_norm": 37.75, "learning_rate": 6.481354066053153e-07, "loss": 1.3983, "step": 16730 }, { "epoch": 0.837, "grad_norm": 1.9765625, "learning_rate": 6.442658021453791e-07, "loss": 1.4004, "step": 16740 }, { "epoch": 0.8375, "grad_norm": 1.9375, "learning_rate": 6.404069881452524e-07, "loss": 1.3851, "step": 16750 }, { "epoch": 0.838, "grad_norm": 2.046875, "learning_rate": 6.365589741643768e-07, "loss": 1.3916, "step": 16760 }, { "epoch": 0.8385, "grad_norm": 2.109375, "learning_rate": 6.32721769735436e-07, "loss": 1.3983, "step": 16770 }, { "epoch": 0.839, "grad_norm": 2.015625, "learning_rate": 6.288953843643369e-07, "loss": 1.3892, "step": 16780 }, { "epoch": 0.8395, "grad_norm": 2.515625, "learning_rate": 6.250798275301828e-07, "loss": 1.3785, "step": 16790 }, { "epoch": 0.84, "grad_norm": 2.28125, "learning_rate": 6.21275108685252e-07, "loss": 1.3895, "step": 16800 }, { "epoch": 0.8405, "grad_norm": 1.8984375, "learning_rate": 6.174812372549738e-07, "loss": 1.3764, "step": 16810 }, { "epoch": 0.841, "grad_norm": 2.078125, "learning_rate": 6.136982226379079e-07, "loss": 1.391, "step": 16820 }, { "epoch": 0.8415, "grad_norm": 1.7890625, "learning_rate": 6.099260742057122e-07, "loss": 1.3714, "step": 16830 }, { "epoch": 0.842, "grad_norm": 2.03125, "learning_rate": 6.061648013031318e-07, "loss": 1.3993, "step": 16840 }, { "epoch": 0.8425, "grad_norm": 2.640625, "learning_rate": 6.024144132479675e-07, "loss": 1.3853, "step": 16850 }, { "epoch": 0.843, "grad_norm": 1.828125, "learning_rate": 5.986749193310559e-07, "loss": 1.3741, "step": 16860 }, { "epoch": 0.8435, "grad_norm": 1.8515625, "learning_rate": 5.949463288162422e-07, "loss": 1.3866, "step": 16870 }, { "epoch": 0.844, "grad_norm": 1.9375, "learning_rate": 5.912286509403658e-07, "loss": 1.3921, "step": 16880 }, { "epoch": 0.8445, "grad_norm": 1.953125, "learning_rate": 5.875218949132284e-07, "loss": 1.3986, "step": 16890 }, { "epoch": 0.845, "grad_norm": 2.1875, "learning_rate": 5.838260699175769e-07, "loss": 1.3956, "step": 16900 }, { "epoch": 0.8455, "grad_norm": 1.75, "learning_rate": 5.801411851090782e-07, "loss": 1.3602, "step": 16910 }, { "epoch": 0.846, "grad_norm": 2.09375, "learning_rate": 5.76467249616296e-07, "loss": 1.3855, "step": 16920 }, { "epoch": 0.8465, "grad_norm": 1.84375, "learning_rate": 5.728042725406702e-07, "loss": 1.3726, "step": 16930 }, { "epoch": 0.847, "grad_norm": 1.8515625, "learning_rate": 5.691522629564966e-07, "loss": 1.3646, "step": 16940 }, { "epoch": 0.8475, "grad_norm": 2.015625, "learning_rate": 5.655112299108939e-07, "loss": 1.3837, "step": 16950 }, { "epoch": 0.848, "grad_norm": 2.515625, "learning_rate": 5.618811824237941e-07, "loss": 1.3796, "step": 16960 }, { "epoch": 0.8485, "grad_norm": 1.875, "learning_rate": 5.582621294879154e-07, "loss": 1.3731, "step": 16970 }, { "epoch": 0.849, "grad_norm": 1.9765625, "learning_rate": 5.546540800687356e-07, "loss": 1.3881, "step": 16980 }, { "epoch": 0.8495, "grad_norm": 2.0625, "learning_rate": 5.510570431044742e-07, "loss": 1.3759, "step": 16990 }, { "epoch": 0.85, "grad_norm": 1.953125, "learning_rate": 5.474710275060724e-07, "loss": 1.3811, "step": 17000 }, { "epoch": 0.8505, "grad_norm": 2.03125, "learning_rate": 5.438960421571642e-07, "loss": 1.3636, "step": 17010 }, { "epoch": 0.851, "grad_norm": 1.921875, "learning_rate": 5.403320959140629e-07, "loss": 1.391, "step": 17020 }, { "epoch": 0.8515, "grad_norm": 2.0, "learning_rate": 5.367791976057307e-07, "loss": 1.3915, "step": 17030 }, { "epoch": 0.852, "grad_norm": 2.03125, "learning_rate": 5.332373560337606e-07, "loss": 1.3819, "step": 17040 }, { "epoch": 0.8525, "grad_norm": 2.046875, "learning_rate": 5.297065799723583e-07, "loss": 1.386, "step": 17050 }, { "epoch": 0.853, "grad_norm": 2.125, "learning_rate": 5.26186878168316e-07, "loss": 1.3712, "step": 17060 }, { "epoch": 0.8535, "grad_norm": 1.9296875, "learning_rate": 5.226782593409874e-07, "loss": 1.3892, "step": 17070 }, { "epoch": 0.854, "grad_norm": 1.890625, "learning_rate": 5.191807321822751e-07, "loss": 1.3924, "step": 17080 }, { "epoch": 0.8545, "grad_norm": 2.09375, "learning_rate": 5.156943053566038e-07, "loss": 1.3946, "step": 17090 }, { "epoch": 0.855, "grad_norm": 1.796875, "learning_rate": 5.122189875008971e-07, "loss": 1.3956, "step": 17100 }, { "epoch": 0.8555, "grad_norm": 2.1875, "learning_rate": 5.08754787224559e-07, "loss": 1.3901, "step": 17110 }, { "epoch": 0.856, "grad_norm": 1.8984375, "learning_rate": 5.053017131094534e-07, "loss": 1.3771, "step": 17120 }, { "epoch": 0.8565, "grad_norm": 2.046875, "learning_rate": 5.018597737098796e-07, "loss": 1.3895, "step": 17130 }, { "epoch": 0.857, "grad_norm": 1.90625, "learning_rate": 4.984289775525547e-07, "loss": 1.3834, "step": 17140 }, { "epoch": 0.8575, "grad_norm": 2.03125, "learning_rate": 4.95009333136589e-07, "loss": 1.3754, "step": 17150 }, { "epoch": 0.858, "grad_norm": 2.0625, "learning_rate": 4.916008489334667e-07, "loss": 1.3742, "step": 17160 }, { "epoch": 0.8585, "grad_norm": 1.8671875, "learning_rate": 4.882035333870255e-07, "loss": 1.3675, "step": 17170 }, { "epoch": 0.859, "grad_norm": 1.90625, "learning_rate": 4.848173949134371e-07, "loss": 1.3799, "step": 17180 }, { "epoch": 0.8595, "grad_norm": 2.0625, "learning_rate": 4.814424419011782e-07, "loss": 1.3847, "step": 17190 }, { "epoch": 0.86, "grad_norm": 1.9140625, "learning_rate": 4.780786827110213e-07, "loss": 1.3924, "step": 17200 }, { "epoch": 0.8605, "grad_norm": 1.875, "learning_rate": 4.747261256760077e-07, "loss": 1.3887, "step": 17210 }, { "epoch": 0.861, "grad_norm": 1.9609375, "learning_rate": 4.713847791014253e-07, "loss": 1.3983, "step": 17220 }, { "epoch": 0.8615, "grad_norm": 1.90625, "learning_rate": 4.6805465126479044e-07, "loss": 1.3933, "step": 17230 }, { "epoch": 0.862, "grad_norm": 1.6640625, "learning_rate": 4.647357504158312e-07, "loss": 1.4048, "step": 17240 }, { "epoch": 0.8625, "grad_norm": 2.015625, "learning_rate": 4.614280847764574e-07, "loss": 1.3614, "step": 17250 }, { "epoch": 0.863, "grad_norm": 1.8046875, "learning_rate": 4.5813166254075116e-07, "loss": 1.381, "step": 17260 }, { "epoch": 0.8635, "grad_norm": 2.15625, "learning_rate": 4.548464918749379e-07, "loss": 1.3907, "step": 17270 }, { "epoch": 0.864, "grad_norm": 2.0, "learning_rate": 4.515725809173699e-07, "loss": 1.3911, "step": 17280 }, { "epoch": 0.8645, "grad_norm": 1.9609375, "learning_rate": 4.483099377785072e-07, "loss": 1.3744, "step": 17290 }, { "epoch": 0.865, "grad_norm": 1.9140625, "learning_rate": 4.4505857054089554e-07, "loss": 1.3867, "step": 17300 }, { "epoch": 0.8655, "grad_norm": 2.078125, "learning_rate": 4.418184872591447e-07, "loss": 1.3972, "step": 17310 }, { "epoch": 0.866, "grad_norm": 2.046875, "learning_rate": 4.385896959599145e-07, "loss": 1.3662, "step": 17320 }, { "epoch": 0.8665, "grad_norm": 1.9453125, "learning_rate": 4.353722046418901e-07, "loss": 1.3854, "step": 17330 }, { "epoch": 0.867, "grad_norm": 1.9375, "learning_rate": 4.3216602127575947e-07, "loss": 1.392, "step": 17340 }, { "epoch": 0.8675, "grad_norm": 2.0, "learning_rate": 4.289711538042013e-07, "loss": 1.3787, "step": 17350 }, { "epoch": 0.868, "grad_norm": 2.0, "learning_rate": 4.257876101418623e-07, "loss": 1.3889, "step": 17360 }, { "epoch": 0.8685, "grad_norm": 2.046875, "learning_rate": 4.226153981753328e-07, "loss": 1.3837, "step": 17370 }, { "epoch": 0.869, "grad_norm": 1.96875, "learning_rate": 4.194545257631333e-07, "loss": 1.3942, "step": 17380 }, { "epoch": 0.8695, "grad_norm": 1.921875, "learning_rate": 4.1630500073569393e-07, "loss": 1.3979, "step": 17390 }, { "epoch": 0.87, "grad_norm": 2.15625, "learning_rate": 4.1316683089533074e-07, "loss": 1.3944, "step": 17400 }, { "epoch": 0.8705, "grad_norm": 1.890625, "learning_rate": 4.1004002401623335e-07, "loss": 1.3712, "step": 17410 }, { "epoch": 0.871, "grad_norm": 2.375, "learning_rate": 4.0692458784443945e-07, "loss": 1.3673, "step": 17420 }, { "epoch": 0.8715, "grad_norm": 1.90625, "learning_rate": 4.0382053009781763e-07, "loss": 1.3812, "step": 17430 }, { "epoch": 0.872, "grad_norm": 1.984375, "learning_rate": 4.007278584660507e-07, "loss": 1.3801, "step": 17440 }, { "epoch": 0.8725, "grad_norm": 1.9140625, "learning_rate": 3.9764658061061557e-07, "loss": 1.3871, "step": 17450 }, { "epoch": 0.873, "grad_norm": 1.8359375, "learning_rate": 3.94576704164758e-07, "loss": 1.3824, "step": 17460 }, { "epoch": 0.8735, "grad_norm": 2.015625, "learning_rate": 3.915182367334841e-07, "loss": 1.3772, "step": 17470 }, { "epoch": 0.874, "grad_norm": 1.8828125, "learning_rate": 3.8847118589353526e-07, "loss": 1.378, "step": 17480 }, { "epoch": 0.8745, "grad_norm": 1.90625, "learning_rate": 3.854355591933706e-07, "loss": 1.3808, "step": 17490 }, { "epoch": 0.875, "grad_norm": 1.8828125, "learning_rate": 3.8241136415314575e-07, "loss": 1.3894, "step": 17500 }, { "epoch": 0.8755, "grad_norm": 2.015625, "learning_rate": 3.793986082647011e-07, "loss": 1.3773, "step": 17510 }, { "epoch": 0.876, "grad_norm": 1.9375, "learning_rate": 3.7639729899153433e-07, "loss": 1.3633, "step": 17520 }, { "epoch": 0.8765, "grad_norm": 1.9765625, "learning_rate": 3.7340744376879066e-07, "loss": 1.3882, "step": 17530 }, { "epoch": 0.877, "grad_norm": 1.890625, "learning_rate": 3.7042905000323705e-07, "loss": 1.3921, "step": 17540 }, { "epoch": 0.8775, "grad_norm": 2.0, "learning_rate": 3.6746212507324687e-07, "loss": 1.4066, "step": 17550 }, { "epoch": 0.878, "grad_norm": 2.0625, "learning_rate": 3.6450667632878414e-07, "loss": 1.3889, "step": 17560 }, { "epoch": 0.8785, "grad_norm": 1.9765625, "learning_rate": 3.615627110913833e-07, "loss": 1.3667, "step": 17570 }, { "epoch": 0.879, "grad_norm": 1.8671875, "learning_rate": 3.5863023665412557e-07, "loss": 1.3691, "step": 17580 }, { "epoch": 0.8795, "grad_norm": 2.015625, "learning_rate": 3.557092602816309e-07, "loss": 1.4173, "step": 17590 }, { "epoch": 0.88, "grad_norm": 2.140625, "learning_rate": 3.5279978921003544e-07, "loss": 1.3831, "step": 17600 }, { "epoch": 0.8805, "grad_norm": 2.171875, "learning_rate": 3.499018306469704e-07, "loss": 1.3969, "step": 17610 }, { "epoch": 0.881, "grad_norm": 2.109375, "learning_rate": 3.4701539177154686e-07, "loss": 1.389, "step": 17620 }, { "epoch": 0.8815, "grad_norm": 1.8359375, "learning_rate": 3.441404797343412e-07, "loss": 1.4062, "step": 17630 }, { "epoch": 0.882, "grad_norm": 1.953125, "learning_rate": 3.412771016573713e-07, "loss": 1.3866, "step": 17640 }, { "epoch": 0.8825, "grad_norm": 1.9140625, "learning_rate": 3.3842526463408523e-07, "loss": 1.3773, "step": 17650 }, { "epoch": 0.883, "grad_norm": 2.015625, "learning_rate": 3.3558497572933723e-07, "loss": 1.3738, "step": 17660 }, { "epoch": 0.8835, "grad_norm": 1.953125, "learning_rate": 3.3275624197937395e-07, "loss": 1.3806, "step": 17670 }, { "epoch": 0.884, "grad_norm": 1.75, "learning_rate": 3.2993907039181925e-07, "loss": 1.3795, "step": 17680 }, { "epoch": 0.8845, "grad_norm": 2.015625, "learning_rate": 3.271334679456506e-07, "loss": 1.3795, "step": 17690 }, { "epoch": 0.885, "grad_norm": 1.84375, "learning_rate": 3.2433944159118547e-07, "loss": 1.3912, "step": 17700 }, { "epoch": 0.8855, "grad_norm": 1.859375, "learning_rate": 3.2155699825006715e-07, "loss": 1.3781, "step": 17710 }, { "epoch": 0.886, "grad_norm": 1.9921875, "learning_rate": 3.1878614481524184e-07, "loss": 1.3805, "step": 17720 }, { "epoch": 0.8865, "grad_norm": 1.828125, "learning_rate": 3.1602688815094375e-07, "loss": 1.3853, "step": 17730 }, { "epoch": 0.887, "grad_norm": 2.046875, "learning_rate": 3.132792350926783e-07, "loss": 1.3904, "step": 17740 }, { "epoch": 0.8875, "grad_norm": 2.296875, "learning_rate": 3.105431924472085e-07, "loss": 1.3699, "step": 17750 }, { "epoch": 0.888, "grad_norm": 1.9453125, "learning_rate": 3.078187669925298e-07, "loss": 1.3844, "step": 17760 }, { "epoch": 0.8885, "grad_norm": 1.890625, "learning_rate": 3.051059654778632e-07, "loss": 1.3753, "step": 17770 }, { "epoch": 0.889, "grad_norm": 1.9296875, "learning_rate": 3.024047946236308e-07, "loss": 1.381, "step": 17780 }, { "epoch": 0.8895, "grad_norm": 1.9765625, "learning_rate": 2.9971526112144213e-07, "loss": 1.3762, "step": 17790 }, { "epoch": 0.89, "grad_norm": 1.9921875, "learning_rate": 2.970373716340791e-07, "loss": 1.3906, "step": 17800 }, { "epoch": 0.8905, "grad_norm": 2.046875, "learning_rate": 2.943711327954768e-07, "loss": 1.3788, "step": 17810 }, { "epoch": 0.891, "grad_norm": 2.09375, "learning_rate": 2.9171655121070787e-07, "loss": 1.3916, "step": 17820 }, { "epoch": 0.8915, "grad_norm": 2.9375, "learning_rate": 2.890736334559674e-07, "loss": 1.3969, "step": 17830 }, { "epoch": 0.892, "grad_norm": 1.9453125, "learning_rate": 2.8644238607855647e-07, "loss": 1.399, "step": 17840 }, { "epoch": 0.8925, "grad_norm": 1.9921875, "learning_rate": 2.8382281559686087e-07, "loss": 1.3832, "step": 17850 }, { "epoch": 0.893, "grad_norm": 1.890625, "learning_rate": 2.812149285003446e-07, "loss": 1.3981, "step": 17860 }, { "epoch": 0.8935, "grad_norm": 2.25, "learning_rate": 2.786187312495259e-07, "loss": 1.3891, "step": 17870 }, { "epoch": 0.894, "grad_norm": 1.8125, "learning_rate": 2.7603423027596387e-07, "loss": 1.3593, "step": 17880 }, { "epoch": 0.8945, "grad_norm": 2.171875, "learning_rate": 2.7346143198224097e-07, "loss": 1.383, "step": 17890 }, { "epoch": 0.895, "grad_norm": 1.84375, "learning_rate": 2.709003427419521e-07, "loss": 1.3755, "step": 17900 }, { "epoch": 0.8955, "grad_norm": 1.984375, "learning_rate": 2.683509688996816e-07, "loss": 1.3789, "step": 17910 }, { "epoch": 0.896, "grad_norm": 1.90625, "learning_rate": 2.658133167709942e-07, "loss": 1.3856, "step": 17920 }, { "epoch": 0.8965, "grad_norm": 1.734375, "learning_rate": 2.6328739264241566e-07, "loss": 1.3806, "step": 17930 }, { "epoch": 0.897, "grad_norm": 2.09375, "learning_rate": 2.607732027714155e-07, "loss": 1.3835, "step": 17940 }, { "epoch": 0.8975, "grad_norm": 2.046875, "learning_rate": 2.582707533863976e-07, "loss": 1.3875, "step": 17950 }, { "epoch": 0.898, "grad_norm": 1.8359375, "learning_rate": 2.5578005068667976e-07, "loss": 1.3848, "step": 17960 }, { "epoch": 0.8985, "grad_norm": 2.125, "learning_rate": 2.5330110084247797e-07, "loss": 1.3874, "step": 17970 }, { "epoch": 0.899, "grad_norm": 1.8125, "learning_rate": 2.5083390999489533e-07, "loss": 1.388, "step": 17980 }, { "epoch": 0.8995, "grad_norm": 1.8203125, "learning_rate": 2.4837848425590396e-07, "loss": 1.3639, "step": 17990 }, { "epoch": 0.9, "grad_norm": 2.140625, "learning_rate": 2.4593482970832916e-07, "loss": 1.3929, "step": 18000 }, { "epoch": 0.9005, "grad_norm": 2.734375, "learning_rate": 2.4350295240583523e-07, "loss": 1.3891, "step": 18010 }, { "epoch": 0.901, "grad_norm": 2.0625, "learning_rate": 2.41082858372913e-07, "loss": 1.3856, "step": 18020 }, { "epoch": 0.9015, "grad_norm": 1.984375, "learning_rate": 2.3867455360486014e-07, "loss": 1.4009, "step": 18030 }, { "epoch": 0.902, "grad_norm": 2.046875, "learning_rate": 2.3627804406777033e-07, "loss": 1.3874, "step": 18040 }, { "epoch": 0.9025, "grad_norm": 2.265625, "learning_rate": 2.338933356985168e-07, "loss": 1.3726, "step": 18050 }, { "epoch": 0.903, "grad_norm": 2.015625, "learning_rate": 2.315204344047356e-07, "loss": 1.3771, "step": 18060 }, { "epoch": 0.9035, "grad_norm": 1.9453125, "learning_rate": 2.2915934606481616e-07, "loss": 1.3969, "step": 18070 }, { "epoch": 0.904, "grad_norm": 1.7109375, "learning_rate": 2.2681007652788246e-07, "loss": 1.3714, "step": 18080 }, { "epoch": 0.9045, "grad_norm": 1.859375, "learning_rate": 2.2447263161377854e-07, "loss": 1.3684, "step": 18090 }, { "epoch": 0.905, "grad_norm": 1.9140625, "learning_rate": 2.2214701711305742e-07, "loss": 1.3657, "step": 18100 }, { "epoch": 0.9055, "grad_norm": 2.046875, "learning_rate": 2.1983323878696338e-07, "loss": 1.3645, "step": 18110 }, { "epoch": 0.906, "grad_norm": 2.015625, "learning_rate": 2.1753130236741915e-07, "loss": 1.4067, "step": 18120 }, { "epoch": 0.9065, "grad_norm": 1.8359375, "learning_rate": 2.152412135570109e-07, "loss": 1.3805, "step": 18130 }, { "epoch": 0.907, "grad_norm": 1.9453125, "learning_rate": 2.1296297802897713e-07, "loss": 1.3746, "step": 18140 }, { "epoch": 0.9075, "grad_norm": 2.0625, "learning_rate": 2.106966014271883e-07, "loss": 1.3997, "step": 18150 }, { "epoch": 0.908, "grad_norm": 2.03125, "learning_rate": 2.084420893661404e-07, "loss": 1.403, "step": 18160 }, { "epoch": 0.9085, "grad_norm": 2.015625, "learning_rate": 2.0619944743093533e-07, "loss": 1.3789, "step": 18170 }, { "epoch": 0.909, "grad_norm": 2.03125, "learning_rate": 2.0396868117726954e-07, "loss": 1.3878, "step": 18180 }, { "epoch": 0.9095, "grad_norm": 2.125, "learning_rate": 2.0174979613142075e-07, "loss": 1.3848, "step": 18190 }, { "epoch": 0.91, "grad_norm": 2.3125, "learning_rate": 1.99542797790232e-07, "loss": 1.3939, "step": 18200 }, { "epoch": 0.9105, "grad_norm": 1.9453125, "learning_rate": 1.9734769162109924e-07, "loss": 1.3741, "step": 18210 }, { "epoch": 0.911, "grad_norm": 1.9609375, "learning_rate": 1.9516448306196035e-07, "loss": 1.3737, "step": 18220 }, { "epoch": 0.9115, "grad_norm": 2.1875, "learning_rate": 1.9299317752127622e-07, "loss": 1.3983, "step": 18230 }, { "epoch": 0.912, "grad_norm": 1.9921875, "learning_rate": 1.9083378037802302e-07, "loss": 1.3815, "step": 18240 }, { "epoch": 0.9125, "grad_norm": 2.125, "learning_rate": 1.886862969816744e-07, "loss": 1.3787, "step": 18250 }, { "epoch": 0.913, "grad_norm": 1.9296875, "learning_rate": 1.8655073265219093e-07, "loss": 1.3678, "step": 18260 }, { "epoch": 0.9135, "grad_norm": 1.984375, "learning_rate": 1.8442709268000515e-07, "loss": 1.3807, "step": 18270 }, { "epoch": 0.914, "grad_norm": 1.96875, "learning_rate": 1.8231538232601153e-07, "loss": 1.3999, "step": 18280 }, { "epoch": 0.9145, "grad_norm": 4.25, "learning_rate": 1.8021560682154883e-07, "loss": 1.3799, "step": 18290 }, { "epoch": 0.915, "grad_norm": 1.9921875, "learning_rate": 1.7812777136838988e-07, "loss": 1.394, "step": 18300 }, { "epoch": 0.9155, "grad_norm": 1.8828125, "learning_rate": 1.7605188113873128e-07, "loss": 1.3695, "step": 18310 }, { "epoch": 0.916, "grad_norm": 1.7890625, "learning_rate": 1.7398794127517438e-07, "loss": 1.3873, "step": 18320 }, { "epoch": 0.9165, "grad_norm": 2.0625, "learning_rate": 1.7193595689071696e-07, "loss": 1.3781, "step": 18330 }, { "epoch": 0.917, "grad_norm": 1.9140625, "learning_rate": 1.6989593306874053e-07, "loss": 1.387, "step": 18340 }, { "epoch": 0.9175, "grad_norm": 1.9765625, "learning_rate": 1.6786787486299528e-07, "loss": 1.3918, "step": 18350 }, { "epoch": 0.918, "grad_norm": 2.03125, "learning_rate": 1.6585178729758956e-07, "loss": 1.3922, "step": 18360 }, { "epoch": 0.9185, "grad_norm": 1.796875, "learning_rate": 1.638476753669771e-07, "loss": 1.3764, "step": 18370 }, { "epoch": 0.919, "grad_norm": 1.953125, "learning_rate": 1.6185554403594371e-07, "loss": 1.3855, "step": 18380 }, { "epoch": 0.9195, "grad_norm": 2.046875, "learning_rate": 1.598753982395962e-07, "loss": 1.3862, "step": 18390 }, { "epoch": 0.92, "grad_norm": 1.90625, "learning_rate": 1.579072428833506e-07, "loss": 1.37, "step": 18400 }, { "epoch": 0.9205, "grad_norm": 1.9375, "learning_rate": 1.5595108284291726e-07, "loss": 1.3811, "step": 18410 }, { "epoch": 0.921, "grad_norm": 2.015625, "learning_rate": 1.5400692296428988e-07, "loss": 1.3795, "step": 18420 }, { "epoch": 0.9215, "grad_norm": 1.796875, "learning_rate": 1.5207476806373745e-07, "loss": 1.3738, "step": 18430 }, { "epoch": 0.922, "grad_norm": 1.8671875, "learning_rate": 1.5015462292778615e-07, "loss": 1.4014, "step": 18440 }, { "epoch": 0.9225, "grad_norm": 2.015625, "learning_rate": 1.4824649231321097e-07, "loss": 1.3662, "step": 18450 }, { "epoch": 0.923, "grad_norm": 1.78125, "learning_rate": 1.4635038094702515e-07, "loss": 1.3868, "step": 18460 }, { "epoch": 0.9235, "grad_norm": 1.8359375, "learning_rate": 1.4446629352646345e-07, "loss": 1.3803, "step": 18470 }, { "epoch": 0.924, "grad_norm": 2.03125, "learning_rate": 1.4259423471897572e-07, "loss": 1.3827, "step": 18480 }, { "epoch": 0.9245, "grad_norm": 1.9609375, "learning_rate": 1.407342091622138e-07, "loss": 1.3763, "step": 18490 }, { "epoch": 0.925, "grad_norm": 1.9296875, "learning_rate": 1.38886221464018e-07, "loss": 1.3737, "step": 18500 }, { "epoch": 0.9255, "grad_norm": 2.046875, "learning_rate": 1.3705027620240897e-07, "loss": 1.3929, "step": 18510 }, { "epoch": 0.926, "grad_norm": 1.8828125, "learning_rate": 1.3522637792557248e-07, "loss": 1.3842, "step": 18520 }, { "epoch": 0.9265, "grad_norm": 1.8515625, "learning_rate": 1.3341453115185255e-07, "loss": 1.3754, "step": 18530 }, { "epoch": 0.927, "grad_norm": 1.8359375, "learning_rate": 1.3161474036973598e-07, "loss": 1.3837, "step": 18540 }, { "epoch": 0.9275, "grad_norm": 1.8515625, "learning_rate": 1.298270100378457e-07, "loss": 1.3969, "step": 18550 }, { "epoch": 0.928, "grad_norm": 1.96875, "learning_rate": 1.2805134458492463e-07, "loss": 1.3959, "step": 18560 }, { "epoch": 0.9285, "grad_norm": 2.15625, "learning_rate": 1.2628774840982849e-07, "loss": 1.3972, "step": 18570 }, { "epoch": 0.929, "grad_norm": 1.734375, "learning_rate": 1.2453622588151525e-07, "loss": 1.3831, "step": 18580 }, { "epoch": 0.9295, "grad_norm": 2.1875, "learning_rate": 1.2279678133903016e-07, "loss": 1.393, "step": 18590 }, { "epoch": 0.93, "grad_norm": 2.1875, "learning_rate": 1.2106941909149906e-07, "loss": 1.3922, "step": 18600 }, { "epoch": 0.9305, "grad_norm": 2.984375, "learning_rate": 1.1935414341811668e-07, "loss": 1.3844, "step": 18610 }, { "epoch": 0.931, "grad_norm": 1.8203125, "learning_rate": 1.1765095856813458e-07, "loss": 1.3845, "step": 18620 }, { "epoch": 0.9315, "grad_norm": 1.921875, "learning_rate": 1.1595986876085375e-07, "loss": 1.3895, "step": 18630 }, { "epoch": 0.932, "grad_norm": 1.8984375, "learning_rate": 1.1428087818560973e-07, "loss": 1.3867, "step": 18640 }, { "epoch": 0.9325, "grad_norm": 1.9921875, "learning_rate": 1.1261399100176596e-07, "loss": 1.3862, "step": 18650 }, { "epoch": 0.933, "grad_norm": 1.8671875, "learning_rate": 1.1095921133870091e-07, "loss": 1.387, "step": 18660 }, { "epoch": 0.9335, "grad_norm": 2.203125, "learning_rate": 1.093165432958021e-07, "loss": 1.3823, "step": 18670 }, { "epoch": 0.934, "grad_norm": 1.8359375, "learning_rate": 1.0768599094244936e-07, "loss": 1.3721, "step": 18680 }, { "epoch": 0.9345, "grad_norm": 1.8125, "learning_rate": 1.0606755831800986e-07, "loss": 1.3805, "step": 18690 }, { "epoch": 0.935, "grad_norm": 1.8671875, "learning_rate": 1.04461249431827e-07, "loss": 1.3769, "step": 18700 }, { "epoch": 0.9355, "grad_norm": 2.046875, "learning_rate": 1.0286706826320936e-07, "loss": 1.3837, "step": 18710 }, { "epoch": 0.936, "grad_norm": 2.0625, "learning_rate": 1.0128501876142172e-07, "loss": 1.3854, "step": 18720 }, { "epoch": 0.9365, "grad_norm": 2.09375, "learning_rate": 9.97151048456757e-08, "loss": 1.3878, "step": 18730 }, { "epoch": 0.937, "grad_norm": 1.8125, "learning_rate": 9.81573304051181e-08, "loss": 1.3747, "step": 18740 }, { "epoch": 0.9375, "grad_norm": 2.0625, "learning_rate": 9.661169929882364e-08, "loss": 1.3706, "step": 18750 }, { "epoch": 0.938, "grad_norm": 2.1875, "learning_rate": 9.507821535578444e-08, "loss": 1.3931, "step": 18760 }, { "epoch": 0.9385, "grad_norm": 2.015625, "learning_rate": 9.35568823748989e-08, "loss": 1.3816, "step": 18770 }, { "epoch": 0.939, "grad_norm": 1.8671875, "learning_rate": 9.204770412496621e-08, "loss": 1.3966, "step": 18780 }, { "epoch": 0.9395, "grad_norm": 1.9296875, "learning_rate": 9.055068434467407e-08, "loss": 1.3921, "step": 18790 }, { "epoch": 0.94, "grad_norm": 2.109375, "learning_rate": 8.906582674258757e-08, "loss": 1.3907, "step": 18800 }, { "epoch": 0.9405, "grad_norm": 2.71875, "learning_rate": 8.759313499714539e-08, "loss": 1.3847, "step": 18810 }, { "epoch": 0.941, "grad_norm": 2.09375, "learning_rate": 8.613261275664642e-08, "loss": 1.3794, "step": 18820 }, { "epoch": 0.9415, "grad_norm": 1.890625, "learning_rate": 8.468426363924254e-08, "loss": 1.3733, "step": 18830 }, { "epoch": 0.942, "grad_norm": 1.9921875, "learning_rate": 8.324809123292754e-08, "loss": 1.3799, "step": 18840 }, { "epoch": 0.9425, "grad_norm": 1.9140625, "learning_rate": 8.182409909553324e-08, "loss": 1.3892, "step": 18850 }, { "epoch": 0.943, "grad_norm": 2.0625, "learning_rate": 8.041229075471336e-08, "loss": 1.3827, "step": 18860 }, { "epoch": 0.9435, "grad_norm": 1.9140625, "learning_rate": 7.901266970794186e-08, "loss": 1.3802, "step": 18870 }, { "epoch": 0.944, "grad_norm": 1.828125, "learning_rate": 7.762523942249855e-08, "loss": 1.4013, "step": 18880 }, { "epoch": 0.9445, "grad_norm": 1.84375, "learning_rate": 7.625000333546517e-08, "loss": 1.3921, "step": 18890 }, { "epoch": 0.945, "grad_norm": 1.9140625, "learning_rate": 7.488696485371372e-08, "loss": 1.3799, "step": 18900 }, { "epoch": 0.9455, "grad_norm": 1.8671875, "learning_rate": 7.353612735389982e-08, "loss": 1.3729, "step": 18910 }, { "epoch": 0.946, "grad_norm": 1.953125, "learning_rate": 7.219749418245158e-08, "loss": 1.3855, "step": 18920 }, { "epoch": 0.9465, "grad_norm": 2.125, "learning_rate": 7.087106865556582e-08, "loss": 1.3881, "step": 18930 }, { "epoch": 0.947, "grad_norm": 2.0, "learning_rate": 6.955685405919677e-08, "loss": 1.3853, "step": 18940 }, { "epoch": 0.9475, "grad_norm": 2.03125, "learning_rate": 6.825485364904738e-08, "loss": 1.3897, "step": 18950 }, { "epoch": 0.948, "grad_norm": 2.671875, "learning_rate": 6.696507065056312e-08, "loss": 1.3851, "step": 18960 }, { "epoch": 0.9485, "grad_norm": 2.34375, "learning_rate": 6.568750825892367e-08, "loss": 1.3855, "step": 18970 }, { "epoch": 0.949, "grad_norm": 2.28125, "learning_rate": 6.442216963903347e-08, "loss": 1.3894, "step": 18980 }, { "epoch": 0.9495, "grad_norm": 1.9375, "learning_rate": 6.31690579255162e-08, "loss": 1.3893, "step": 18990 }, { "epoch": 0.95, "grad_norm": 2.140625, "learning_rate": 6.192817622270586e-08, "loss": 1.3754, "step": 19000 }, { "epoch": 0.9505, "grad_norm": 2.015625, "learning_rate": 6.069952760463738e-08, "loss": 1.3881, "step": 19010 }, { "epoch": 0.951, "grad_norm": 1.9375, "learning_rate": 5.948311511504323e-08, "loss": 1.3841, "step": 19020 }, { "epoch": 0.9515, "grad_norm": 2.015625, "learning_rate": 5.8278941767340726e-08, "loss": 1.3798, "step": 19030 }, { "epoch": 0.952, "grad_norm": 2.03125, "learning_rate": 5.7087010544628616e-08, "loss": 1.3881, "step": 19040 }, { "epoch": 0.9525, "grad_norm": 1.7421875, "learning_rate": 5.5907324399678276e-08, "loss": 1.3884, "step": 19050 }, { "epoch": 0.953, "grad_norm": 2.203125, "learning_rate": 5.473988625492699e-08, "loss": 1.3869, "step": 19060 }, { "epoch": 0.9535, "grad_norm": 2.21875, "learning_rate": 5.358469900246688e-08, "loss": 1.3847, "step": 19070 }, { "epoch": 0.954, "grad_norm": 1.953125, "learning_rate": 5.244176550404434e-08, "loss": 1.3677, "step": 19080 }, { "epoch": 0.9545, "grad_norm": 1.796875, "learning_rate": 5.1311088591049496e-08, "loss": 1.389, "step": 19090 }, { "epoch": 0.955, "grad_norm": 1.8359375, "learning_rate": 5.019267106450621e-08, "loss": 1.3873, "step": 19100 }, { "epoch": 0.9555, "grad_norm": 2.078125, "learning_rate": 4.90865156950715e-08, "loss": 1.3717, "step": 19110 }, { "epoch": 0.956, "grad_norm": 1.953125, "learning_rate": 4.7992625223022837e-08, "loss": 1.4008, "step": 19120 }, { "epoch": 0.9565, "grad_norm": 1.671875, "learning_rate": 4.691100235825585e-08, "loss": 1.3721, "step": 19130 }, { "epoch": 0.957, "grad_norm": 1.9921875, "learning_rate": 4.584164978027439e-08, "loss": 1.3948, "step": 19140 }, { "epoch": 0.9575, "grad_norm": 2.09375, "learning_rate": 4.478457013818604e-08, "loss": 1.4049, "step": 19150 }, { "epoch": 0.958, "grad_norm": 2.0, "learning_rate": 4.373976605069274e-08, "loss": 1.3962, "step": 19160 }, { "epoch": 0.9585, "grad_norm": 1.8359375, "learning_rate": 4.270724010608851e-08, "loss": 1.3732, "step": 19170 }, { "epoch": 0.959, "grad_norm": 2.15625, "learning_rate": 4.168699486225003e-08, "loss": 1.3942, "step": 19180 }, { "epoch": 0.9595, "grad_norm": 1.8984375, "learning_rate": 4.067903284662944e-08, "loss": 1.3779, "step": 19190 }, { "epoch": 0.96, "grad_norm": 2.078125, "learning_rate": 3.968335655625211e-08, "loss": 1.3844, "step": 19200 }, { "epoch": 0.9605, "grad_norm": 1.7890625, "learning_rate": 3.8699968457706627e-08, "loss": 1.39, "step": 19210 }, { "epoch": 0.961, "grad_norm": 2.1875, "learning_rate": 3.7728870987139845e-08, "loss": 1.3646, "step": 19220 }, { "epoch": 0.9615, "grad_norm": 2.0, "learning_rate": 3.677006655025128e-08, "loss": 1.3772, "step": 19230 }, { "epoch": 0.962, "grad_norm": 2.09375, "learning_rate": 3.5823557522287614e-08, "loss": 1.3968, "step": 19240 }, { "epoch": 0.9625, "grad_norm": 1.9140625, "learning_rate": 3.4889346248035414e-08, "loss": 1.3735, "step": 19250 }, { "epoch": 0.963, "grad_norm": 1.921875, "learning_rate": 3.396743504181565e-08, "loss": 1.3644, "step": 19260 }, { "epoch": 0.9635, "grad_norm": 1.953125, "learning_rate": 3.3057826187479766e-08, "loss": 1.3917, "step": 19270 }, { "epoch": 0.964, "grad_norm": 1.9140625, "learning_rate": 3.216052193840025e-08, "loss": 1.3829, "step": 19280 }, { "epoch": 0.9645, "grad_norm": 1.9375, "learning_rate": 3.127552451747007e-08, "loss": 1.3856, "step": 19290 }, { "epoch": 0.965, "grad_norm": 2.578125, "learning_rate": 3.040283611709327e-08, "loss": 1.3859, "step": 19300 }, { "epoch": 0.9655, "grad_norm": 1.96875, "learning_rate": 2.954245889917995e-08, "loss": 1.3911, "step": 19310 }, { "epoch": 0.966, "grad_norm": 2.0, "learning_rate": 2.869439499514237e-08, "loss": 1.3645, "step": 19320 }, { "epoch": 0.9665, "grad_norm": 2.390625, "learning_rate": 2.785864650589054e-08, "loss": 1.3971, "step": 19330 }, { "epoch": 0.967, "grad_norm": 2.03125, "learning_rate": 2.7035215501822753e-08, "loss": 1.3914, "step": 19340 }, { "epoch": 0.9675, "grad_norm": 4.375, "learning_rate": 2.622410402282449e-08, "loss": 1.3892, "step": 19350 }, { "epoch": 0.968, "grad_norm": 2.265625, "learning_rate": 2.5425314078262875e-08, "loss": 1.3833, "step": 19360 }, { "epoch": 0.9685, "grad_norm": 2.265625, "learning_rate": 2.4638847646979992e-08, "loss": 1.3661, "step": 19370 }, { "epoch": 0.969, "grad_norm": 2.109375, "learning_rate": 2.3864706677288462e-08, "loss": 1.413, "step": 19380 }, { "epoch": 0.9695, "grad_norm": 2.03125, "learning_rate": 2.310289308696867e-08, "loss": 1.3846, "step": 19390 }, { "epoch": 0.97, "grad_norm": 1.8828125, "learning_rate": 2.2353408763259866e-08, "loss": 1.3775, "step": 19400 }, { "epoch": 0.9705, "grad_norm": 1.9453125, "learning_rate": 2.1616255562860734e-08, "loss": 1.3743, "step": 19410 }, { "epoch": 0.971, "grad_norm": 1.7734375, "learning_rate": 2.0891435311920506e-08, "loss": 1.3746, "step": 19420 }, { "epoch": 0.9715, "grad_norm": 1.9609375, "learning_rate": 2.0178949806036184e-08, "loss": 1.3776, "step": 19430 }, { "epoch": 0.972, "grad_norm": 1.8125, "learning_rate": 1.947880081024811e-08, "loss": 1.3821, "step": 19440 }, { "epoch": 0.9725, "grad_norm": 2.03125, "learning_rate": 1.8790990059034953e-08, "loss": 1.3855, "step": 19450 }, { "epoch": 0.973, "grad_norm": 1.8984375, "learning_rate": 1.8115519256310387e-08, "loss": 1.3719, "step": 19460 }, { "epoch": 0.9735, "grad_norm": 1.984375, "learning_rate": 1.7452390075418657e-08, "loss": 1.3816, "step": 19470 }, { "epoch": 0.974, "grad_norm": 2.078125, "learning_rate": 1.680160415912846e-08, "loss": 1.3852, "step": 19480 }, { "epoch": 0.9745, "grad_norm": 2.34375, "learning_rate": 1.616316311963295e-08, "loss": 1.3923, "step": 19490 }, { "epoch": 0.975, "grad_norm": 2.0625, "learning_rate": 1.553706853854142e-08, "loss": 1.3752, "step": 19500 }, { "epoch": 0.9755, "grad_norm": 1.8984375, "learning_rate": 1.492332196687818e-08, "loss": 1.3914, "step": 19510 }, { "epoch": 0.976, "grad_norm": 5.8125, "learning_rate": 1.4321924925077003e-08, "loss": 1.3788, "step": 19520 }, { "epoch": 0.9765, "grad_norm": 1.9765625, "learning_rate": 1.373287890297892e-08, "loss": 1.3697, "step": 19530 }, { "epoch": 0.977, "grad_norm": 1.96875, "learning_rate": 1.3156185359827767e-08, "loss": 1.3913, "step": 19540 }, { "epoch": 0.9775, "grad_norm": 1.90625, "learning_rate": 1.2591845724266304e-08, "loss": 1.4124, "step": 19550 }, { "epoch": 0.978, "grad_norm": 2.40625, "learning_rate": 1.203986139433344e-08, "loss": 1.3971, "step": 19560 }, { "epoch": 0.9785, "grad_norm": 1.8671875, "learning_rate": 1.1500233737459232e-08, "loss": 1.386, "step": 19570 }, { "epoch": 0.979, "grad_norm": 1.8046875, "learning_rate": 1.0972964090463223e-08, "loss": 1.3951, "step": 19580 }, { "epoch": 0.9795, "grad_norm": 1.8515625, "learning_rate": 1.0458053759550558e-08, "loss": 1.3845, "step": 19590 }, { "epoch": 0.98, "grad_norm": 2.28125, "learning_rate": 9.955504020309204e-09, "loss": 1.391, "step": 19600 }, { "epoch": 0.9805, "grad_norm": 1.8515625, "learning_rate": 9.46531611770496e-09, "loss": 1.356, "step": 19610 }, { "epoch": 0.981, "grad_norm": 1.890625, "learning_rate": 8.98749126608034e-09, "loss": 1.3849, "step": 19620 }, { "epoch": 0.9815, "grad_norm": 2.09375, "learning_rate": 8.522030649150691e-09, "loss": 1.3819, "step": 19630 }, { "epoch": 0.982, "grad_norm": 1.9921875, "learning_rate": 8.068935420001423e-09, "loss": 1.385, "step": 19640 }, { "epoch": 0.9825, "grad_norm": 2.640625, "learning_rate": 7.628206701085772e-09, "loss": 1.3784, "step": 19650 }, { "epoch": 0.983, "grad_norm": 2.171875, "learning_rate": 7.199845584220377e-09, "loss": 1.3823, "step": 19660 }, { "epoch": 0.9835, "grad_norm": 2.265625, "learning_rate": 6.783853130584717e-09, "loss": 1.4043, "step": 19670 }, { "epoch": 0.984, "grad_norm": 2.3125, "learning_rate": 6.380230370716667e-09, "loss": 1.3952, "step": 19680 }, { "epoch": 0.9845, "grad_norm": 2.265625, "learning_rate": 5.98897830451084e-09, "loss": 1.3852, "step": 19690 }, { "epoch": 0.985, "grad_norm": 2.171875, "learning_rate": 5.610097901215805e-09, "loss": 1.4121, "step": 19700 }, { "epoch": 0.9855, "grad_norm": 1.9921875, "learning_rate": 5.243590099431872e-09, "loss": 1.3729, "step": 19710 }, { "epoch": 0.986, "grad_norm": 1.734375, "learning_rate": 4.889455807108867e-09, "loss": 1.3713, "step": 19720 }, { "epoch": 0.9865, "grad_norm": 1.765625, "learning_rate": 4.547695901543914e-09, "loss": 1.3926, "step": 19730 }, { "epoch": 0.987, "grad_norm": 2.40625, "learning_rate": 4.218311229378658e-09, "loss": 1.3775, "step": 19740 }, { "epoch": 0.9875, "grad_norm": 1.7890625, "learning_rate": 3.901302606597601e-09, "loss": 1.3758, "step": 19750 }, { "epoch": 0.988, "grad_norm": 1.8671875, "learning_rate": 3.5966708185258823e-09, "loss": 1.3899, "step": 19760 }, { "epoch": 0.9885, "grad_norm": 1.859375, "learning_rate": 3.304416619828166e-09, "loss": 1.3754, "step": 19770 }, { "epoch": 0.989, "grad_norm": 2.03125, "learning_rate": 3.024540734505865e-09, "loss": 1.3787, "step": 19780 }, { "epoch": 0.9895, "grad_norm": 2.078125, "learning_rate": 2.757043855894925e-09, "loss": 1.3878, "step": 19790 }, { "epoch": 0.99, "grad_norm": 1.7890625, "learning_rate": 2.501926646666375e-09, "loss": 1.3787, "step": 19800 }, { "epoch": 0.9905, "grad_norm": 2.1875, "learning_rate": 2.259189738820777e-09, "loss": 1.3747, "step": 19810 }, { "epoch": 0.991, "grad_norm": 1.8359375, "learning_rate": 2.0288337336910046e-09, "loss": 1.3917, "step": 19820 }, { "epoch": 0.9915, "grad_norm": 2.25, "learning_rate": 1.810859201937798e-09, "loss": 1.3908, "step": 19830 }, { "epoch": 0.992, "grad_norm": 1.7109375, "learning_rate": 1.6052666835492114e-09, "loss": 1.3675, "step": 19840 }, { "epoch": 0.9925, "grad_norm": 1.96875, "learning_rate": 1.4120566878400576e-09, "loss": 1.3804, "step": 19850 }, { "epoch": 0.993, "grad_norm": 2.078125, "learning_rate": 1.231229693449132e-09, "loss": 1.3751, "step": 19860 }, { "epoch": 0.9935, "grad_norm": 2.140625, "learning_rate": 1.062786148338657e-09, "loss": 1.3863, "step": 19870 }, { "epoch": 0.994, "grad_norm": 1.90625, "learning_rate": 9.067264697948386e-10, "loss": 1.3874, "step": 19880 }, { "epoch": 0.9945, "grad_norm": 1.9296875, "learning_rate": 7.630510444234241e-10, "loss": 1.3892, "step": 19890 }, { "epoch": 0.995, "grad_norm": 1.9765625, "learning_rate": 6.317602281524781e-10, "loss": 1.3652, "step": 19900 }, { "epoch": 0.9955, "grad_norm": 2.09375, "learning_rate": 5.128543462273872e-10, "loss": 1.3818, "step": 19910 }, { "epoch": 0.996, "grad_norm": 1.84375, "learning_rate": 4.0633369321474437e-10, "loss": 1.3781, "step": 19920 }, { "epoch": 0.9965, "grad_norm": 1.7890625, "learning_rate": 3.1219853299790937e-10, "loss": 1.3825, "step": 19930 }, { "epoch": 0.997, "grad_norm": 2.109375, "learning_rate": 2.3044909877756317e-10, "loss": 1.3787, "step": 19940 }, { "epoch": 0.9975, "grad_norm": 2.078125, "learning_rate": 1.6108559307170812e-10, "loss": 1.4002, "step": 19950 }, { "epoch": 0.998, "grad_norm": 1.8828125, "learning_rate": 1.0410818771511289e-10, "loss": 1.3791, "step": 19960 }, { "epoch": 0.9985, "grad_norm": 1.9140625, "learning_rate": 5.95170238570919e-11, "loss": 1.3796, "step": 19970 }, { "epoch": 0.999, "grad_norm": 1.84375, "learning_rate": 2.7312211963725908e-11, "loss": 1.3787, "step": 19980 }, { "epoch": 0.9995, "grad_norm": 2.15625, "learning_rate": 7.49383181564145e-12, "loss": 1.3843, "step": 19990 }, { "epoch": 1.0, "grad_norm": 1.6796875, "learning_rate": 6.193250967623244e-14, "loss": 1.3795, "step": 20000 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.453599608166941e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }