| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.9950900163666123, |
| "eval_steps": 500, |
| "global_step": 1830, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.008183306055646482, |
| "grad_norm": 2.34375, |
| "learning_rate": 1.142857142857143e-06, |
| "loss": 0.6534, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.016366612111292964, |
| "grad_norm": 2.15625, |
| "learning_rate": 2.571428571428571e-06, |
| "loss": 0.6508, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.024549918166939442, |
| "grad_norm": 1.84375, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.6493, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.03273322422258593, |
| "grad_norm": 1.4453125, |
| "learning_rate": 5.428571428571429e-06, |
| "loss": 0.6052, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.04091653027823241, |
| "grad_norm": 1.0625, |
| "learning_rate": 6.857142857142858e-06, |
| "loss": 0.5644, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.049099836333878884, |
| "grad_norm": 0.76953125, |
| "learning_rate": 8.285714285714287e-06, |
| "loss": 0.5195, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.057283142389525366, |
| "grad_norm": 0.63671875, |
| "learning_rate": 9.714285714285715e-06, |
| "loss": 0.5033, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.06546644844517185, |
| "grad_norm": 0.59375, |
| "learning_rate": 9.99990841172964e-06, |
| "loss": 0.4931, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.07364975450081833, |
| "grad_norm": 0.56640625, |
| "learning_rate": 9.999536342048818e-06, |
| "loss": 0.4682, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.08183306055646482, |
| "grad_norm": 0.60546875, |
| "learning_rate": 9.998878095066407e-06, |
| "loss": 0.4627, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.09001636661211129, |
| "grad_norm": 0.53125, |
| "learning_rate": 9.997933721022044e-06, |
| "loss": 0.4458, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.09819967266775777, |
| "grad_norm": 0.55078125, |
| "learning_rate": 9.996703291993557e-06, |
| "loss": 0.4567, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.10638297872340426, |
| "grad_norm": 0.52734375, |
| "learning_rate": 9.995186901891448e-06, |
| "loss": 0.4644, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.11456628477905073, |
| "grad_norm": 0.53515625, |
| "learning_rate": 9.993384666451743e-06, |
| "loss": 0.4547, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.12274959083469722, |
| "grad_norm": 0.55859375, |
| "learning_rate": 9.991296723227148e-06, |
| "loss": 0.4546, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.1309328968903437, |
| "grad_norm": 0.5234375, |
| "learning_rate": 9.988923231576558e-06, |
| "loss": 0.4334, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.13911620294599017, |
| "grad_norm": 0.5078125, |
| "learning_rate": 9.986264372652883e-06, |
| "loss": 0.4419, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.14729950900163666, |
| "grad_norm": 0.53125, |
| "learning_rate": 9.983320349389237e-06, |
| "loss": 0.4409, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.15548281505728315, |
| "grad_norm": 0.51953125, |
| "learning_rate": 9.980091386483434e-06, |
| "loss": 0.4618, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.16366612111292964, |
| "grad_norm": 0.515625, |
| "learning_rate": 9.976577730380855e-06, |
| "loss": 0.4488, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1718494271685761, |
| "grad_norm": 0.515625, |
| "learning_rate": 9.972779649255617e-06, |
| "loss": 0.4408, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.18003273322422259, |
| "grad_norm": 0.5234375, |
| "learning_rate": 9.968697432990129e-06, |
| "loss": 0.4423, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.18821603927986907, |
| "grad_norm": 0.5078125, |
| "learning_rate": 9.964331393152947e-06, |
| "loss": 0.4503, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.19639934533551553, |
| "grad_norm": 0.51171875, |
| "learning_rate": 9.959681862975007e-06, |
| "loss": 0.4282, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.20458265139116202, |
| "grad_norm": 0.51171875, |
| "learning_rate": 9.954749197324184e-06, |
| "loss": 0.4336, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.2127659574468085, |
| "grad_norm": 0.51953125, |
| "learning_rate": 9.949533772678215e-06, |
| "loss": 0.4337, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.220949263502455, |
| "grad_norm": 0.5234375, |
| "learning_rate": 9.944035987095955e-06, |
| "loss": 0.4365, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.22913256955810146, |
| "grad_norm": 0.51953125, |
| "learning_rate": 9.938256260187002e-06, |
| "loss": 0.4259, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.23731587561374795, |
| "grad_norm": 0.51171875, |
| "learning_rate": 9.932195033079677e-06, |
| "loss": 0.4291, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.24549918166939444, |
| "grad_norm": 0.515625, |
| "learning_rate": 9.925852768387337e-06, |
| "loss": 0.4362, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.25368248772504093, |
| "grad_norm": 0.5390625, |
| "learning_rate": 9.919229950173089e-06, |
| "loss": 0.4434, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.2618657937806874, |
| "grad_norm": 0.51953125, |
| "learning_rate": 9.912327083912825e-06, |
| "loss": 0.4278, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.2700490998363339, |
| "grad_norm": 0.53515625, |
| "learning_rate": 9.905144696456664e-06, |
| "loss": 0.4324, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.27823240589198034, |
| "grad_norm": 0.51953125, |
| "learning_rate": 9.897683335988714e-06, |
| "loss": 0.4505, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.2864157119476268, |
| "grad_norm": 0.515625, |
| "learning_rate": 9.889943571985258e-06, |
| "loss": 0.4329, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.2945990180032733, |
| "grad_norm": 0.5234375, |
| "learning_rate": 9.881925995171272e-06, |
| "loss": 0.4439, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.3027823240589198, |
| "grad_norm": 0.53125, |
| "learning_rate": 9.873631217475355e-06, |
| "loss": 0.4452, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.3109656301145663, |
| "grad_norm": 0.55078125, |
| "learning_rate": 9.865059871983003e-06, |
| "loss": 0.4329, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.3191489361702128, |
| "grad_norm": 0.515625, |
| "learning_rate": 9.856212612888312e-06, |
| "loss": 0.4224, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.32733224222585927, |
| "grad_norm": 0.54296875, |
| "learning_rate": 9.847090115444032e-06, |
| "loss": 0.4186, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3355155482815057, |
| "grad_norm": 0.515625, |
| "learning_rate": 9.837693075910037e-06, |
| "loss": 0.4178, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.3436988543371522, |
| "grad_norm": 0.53125, |
| "learning_rate": 9.828022211500183e-06, |
| "loss": 0.441, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.3518821603927987, |
| "grad_norm": 0.54296875, |
| "learning_rate": 9.81807826032757e-06, |
| "loss": 0.4261, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.36006546644844517, |
| "grad_norm": 0.515625, |
| "learning_rate": 9.807861981348196e-06, |
| "loss": 0.422, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.36824877250409166, |
| "grad_norm": 0.498046875, |
| "learning_rate": 9.797374154303048e-06, |
| "loss": 0.4346, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.37643207855973815, |
| "grad_norm": 0.53515625, |
| "learning_rate": 9.786615579658571e-06, |
| "loss": 0.4226, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.38461538461538464, |
| "grad_norm": 0.53125, |
| "learning_rate": 9.77558707854559e-06, |
| "loss": 0.4136, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.39279869067103107, |
| "grad_norm": 0.52734375, |
| "learning_rate": 9.764289492696628e-06, |
| "loss": 0.4292, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.40098199672667756, |
| "grad_norm": 0.53125, |
| "learning_rate": 9.752723684381666e-06, |
| "loss": 0.4127, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.40916530278232405, |
| "grad_norm": 0.52734375, |
| "learning_rate": 9.740890536342336e-06, |
| "loss": 0.4383, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.41734860883797054, |
| "grad_norm": 0.51953125, |
| "learning_rate": 9.728790951724532e-06, |
| "loss": 0.4216, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.425531914893617, |
| "grad_norm": 0.5, |
| "learning_rate": 9.716425854009501e-06, |
| "loss": 0.4117, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.4337152209492635, |
| "grad_norm": 0.515625, |
| "learning_rate": 9.70379618694334e-06, |
| "loss": 0.427, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.44189852700491, |
| "grad_norm": 0.51171875, |
| "learning_rate": 9.690902914464977e-06, |
| "loss": 0.4267, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.4500818330605565, |
| "grad_norm": 0.5078125, |
| "learning_rate": 9.677747020632595e-06, |
| "loss": 0.4178, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.4582651391162029, |
| "grad_norm": 0.50390625, |
| "learning_rate": 9.664329509548534e-06, |
| "loss": 0.4094, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.4664484451718494, |
| "grad_norm": 0.50390625, |
| "learning_rate": 9.650651405282638e-06, |
| "loss": 0.402, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.4746317512274959, |
| "grad_norm": 0.5078125, |
| "learning_rate": 9.63671375179411e-06, |
| "loss": 0.4147, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.4828150572831424, |
| "grad_norm": 0.494140625, |
| "learning_rate": 9.622517612851832e-06, |
| "loss": 0.4107, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.4909983633387889, |
| "grad_norm": 0.5078125, |
| "learning_rate": 9.608064071953162e-06, |
| "loss": 0.4152, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.49918166939443537, |
| "grad_norm": 0.5234375, |
| "learning_rate": 9.593354232241251e-06, |
| "loss": 0.4308, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.5073649754500819, |
| "grad_norm": 0.51171875, |
| "learning_rate": 9.578389216420844e-06, |
| "loss": 0.4145, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.5155482815057283, |
| "grad_norm": 0.515625, |
| "learning_rate": 9.563170166672585e-06, |
| "loss": 0.4062, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.5237315875613748, |
| "grad_norm": 0.51953125, |
| "learning_rate": 9.547698244565855e-06, |
| "loss": 0.419, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.5319148936170213, |
| "grad_norm": 0.5234375, |
| "learning_rate": 9.5319746309701e-06, |
| "loss": 0.413, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.5400981996726678, |
| "grad_norm": 0.515625, |
| "learning_rate": 9.516000525964716e-06, |
| "loss": 0.4404, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.5482815057283142, |
| "grad_norm": 0.51171875, |
| "learning_rate": 9.499777148747455e-06, |
| "loss": 0.4185, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.5564648117839607, |
| "grad_norm": 0.5, |
| "learning_rate": 9.48330573754136e-06, |
| "loss": 0.4134, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.5646481178396072, |
| "grad_norm": 0.5234375, |
| "learning_rate": 9.466587549500274e-06, |
| "loss": 0.4305, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.5728314238952537, |
| "grad_norm": 0.50390625, |
| "learning_rate": 9.449623860612879e-06, |
| "loss": 0.4108, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.5810147299509002, |
| "grad_norm": 0.50390625, |
| "learning_rate": 9.432415965605318e-06, |
| "loss": 0.4086, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.5891980360065466, |
| "grad_norm": 0.5234375, |
| "learning_rate": 9.414965177842361e-06, |
| "loss": 0.4025, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.5973813420621932, |
| "grad_norm": 0.51953125, |
| "learning_rate": 9.397272829227187e-06, |
| "loss": 0.4191, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.6055646481178396, |
| "grad_norm": 0.51171875, |
| "learning_rate": 9.379340270099708e-06, |
| "loss": 0.4117, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.613747954173486, |
| "grad_norm": 0.515625, |
| "learning_rate": 9.361168869133516e-06, |
| "loss": 0.418, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.6219312602291326, |
| "grad_norm": 0.515625, |
| "learning_rate": 9.342760013231429e-06, |
| "loss": 0.4197, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.630114566284779, |
| "grad_norm": 0.51953125, |
| "learning_rate": 9.324115107419616e-06, |
| "loss": 0.4173, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.6382978723404256, |
| "grad_norm": 0.5078125, |
| "learning_rate": 9.305235574740386e-06, |
| "loss": 0.4133, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.646481178396072, |
| "grad_norm": 0.53515625, |
| "learning_rate": 9.286122856143555e-06, |
| "loss": 0.4223, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.6546644844517185, |
| "grad_norm": 0.494140625, |
| "learning_rate": 9.266778410376484e-06, |
| "loss": 0.4045, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.662847790507365, |
| "grad_norm": 0.51171875, |
| "learning_rate": 9.247203713872732e-06, |
| "loss": 0.4196, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.6710310965630114, |
| "grad_norm": 0.5234375, |
| "learning_rate": 9.227400260639374e-06, |
| "loss": 0.3995, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.679214402618658, |
| "grad_norm": 0.53515625, |
| "learning_rate": 9.207369562142975e-06, |
| "loss": 0.4154, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.6873977086743044, |
| "grad_norm": 0.5234375, |
| "learning_rate": 9.187113147194222e-06, |
| "loss": 0.4077, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.6955810147299509, |
| "grad_norm": 0.5078125, |
| "learning_rate": 9.166632561831252e-06, |
| "loss": 0.4012, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.7037643207855974, |
| "grad_norm": 0.53515625, |
| "learning_rate": 9.145929369201646e-06, |
| "loss": 0.4007, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.7119476268412439, |
| "grad_norm": 0.494140625, |
| "learning_rate": 9.125005149443117e-06, |
| "loss": 0.412, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.7201309328968903, |
| "grad_norm": 0.56640625, |
| "learning_rate": 9.103861499562925e-06, |
| "loss": 0.4165, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.7283142389525368, |
| "grad_norm": 0.5, |
| "learning_rate": 9.082500033315976e-06, |
| "loss": 0.4114, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.7364975450081833, |
| "grad_norm": 0.515625, |
| "learning_rate": 9.060922381081658e-06, |
| "loss": 0.419, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.7446808510638298, |
| "grad_norm": 0.5234375, |
| "learning_rate": 9.039130189739405e-06, |
| "loss": 0.4209, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.7528641571194763, |
| "grad_norm": 0.546875, |
| "learning_rate": 9.017125122543006e-06, |
| "loss": 0.4359, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.7610474631751227, |
| "grad_norm": 0.5546875, |
| "learning_rate": 8.994908858993647e-06, |
| "loss": 0.4162, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.7692307692307693, |
| "grad_norm": 0.515625, |
| "learning_rate": 8.97248309471174e-06, |
| "loss": 0.3991, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.7774140752864157, |
| "grad_norm": 0.498046875, |
| "learning_rate": 8.949849541307505e-06, |
| "loss": 0.4087, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.7855973813420621, |
| "grad_norm": 0.51953125, |
| "learning_rate": 8.927009926250324e-06, |
| "loss": 0.4053, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.7937806873977087, |
| "grad_norm": 0.53125, |
| "learning_rate": 8.903965992736903e-06, |
| "loss": 0.4016, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.8019639934533551, |
| "grad_norm": 0.53515625, |
| "learning_rate": 8.880719499558226e-06, |
| "loss": 0.4128, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.8101472995090017, |
| "grad_norm": 0.51171875, |
| "learning_rate": 8.85727222096532e-06, |
| "loss": 0.4146, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.8183306055646481, |
| "grad_norm": 0.486328125, |
| "learning_rate": 8.833625946533826e-06, |
| "loss": 0.407, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8183306055646481, |
| "eval_loss": 0.40063852071762085, |
| "eval_runtime": 5.182, |
| "eval_samples_per_second": 16.017, |
| "eval_steps_per_second": 16.017, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8265139116202946, |
| "grad_norm": 0.51953125, |
| "learning_rate": 8.809782481027425e-06, |
| "loss": 0.4279, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.8346972176759411, |
| "grad_norm": 0.5234375, |
| "learning_rate": 8.785743644260087e-06, |
| "loss": 0.4123, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.8428805237315876, |
| "grad_norm": 0.50390625, |
| "learning_rate": 8.761511270957179e-06, |
| "loss": 0.3964, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.851063829787234, |
| "grad_norm": 0.5390625, |
| "learning_rate": 8.737087210615434e-06, |
| "loss": 0.4143, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.8592471358428805, |
| "grad_norm": 0.50390625, |
| "learning_rate": 8.71247332736178e-06, |
| "loss": 0.4108, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.867430441898527, |
| "grad_norm": 0.5, |
| "learning_rate": 8.687671499811083e-06, |
| "loss": 0.4157, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.8756137479541735, |
| "grad_norm": 0.4921875, |
| "learning_rate": 8.662683620922743e-06, |
| "loss": 0.4068, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.88379705400982, |
| "grad_norm": 0.5078125, |
| "learning_rate": 8.637511597856234e-06, |
| "loss": 0.4154, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.8919803600654664, |
| "grad_norm": 0.4921875, |
| "learning_rate": 8.612157351825536e-06, |
| "loss": 0.3982, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.900163666121113, |
| "grad_norm": 0.515625, |
| "learning_rate": 8.586622817952504e-06, |
| "loss": 0.4016, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.9083469721767594, |
| "grad_norm": 0.5, |
| "learning_rate": 8.560909945119162e-06, |
| "loss": 0.405, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.9165302782324058, |
| "grad_norm": 0.53515625, |
| "learning_rate": 8.53502069581898e-06, |
| "loss": 0.4107, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.9247135842880524, |
| "grad_norm": 0.4921875, |
| "learning_rate": 8.50895704600707e-06, |
| "loss": 0.398, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.9328968903436988, |
| "grad_norm": 0.50390625, |
| "learning_rate": 8.48272098494938e-06, |
| "loss": 0.3961, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.9410801963993454, |
| "grad_norm": 0.49609375, |
| "learning_rate": 8.45631451507087e-06, |
| "loss": 0.4021, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.9492635024549918, |
| "grad_norm": 0.51953125, |
| "learning_rate": 8.429739651802676e-06, |
| "loss": 0.4104, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.9574468085106383, |
| "grad_norm": 0.498046875, |
| "learning_rate": 8.402998423428291e-06, |
| "loss": 0.411, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.9656301145662848, |
| "grad_norm": 0.51953125, |
| "learning_rate": 8.376092870928752e-06, |
| "loss": 0.4266, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.9738134206219312, |
| "grad_norm": 0.50390625, |
| "learning_rate": 8.349025047826873e-06, |
| "loss": 0.4103, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.9819967266775778, |
| "grad_norm": 0.498046875, |
| "learning_rate": 8.321797020030504e-06, |
| "loss": 0.4233, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.9901800327332242, |
| "grad_norm": 0.515625, |
| "learning_rate": 8.294410865674864e-06, |
| "loss": 0.4273, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.9983633387888707, |
| "grad_norm": 0.51171875, |
| "learning_rate": 8.266868674963924e-06, |
| "loss": 0.4179, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.9983633387888707, |
| "eval_loss": 0.39774245023727417, |
| "eval_runtime": 5.08, |
| "eval_samples_per_second": 16.338, |
| "eval_steps_per_second": 16.338, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.0065466448445173, |
| "grad_norm": 0.5, |
| "learning_rate": 8.23917255001088e-06, |
| "loss": 0.381, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.0147299509001637, |
| "grad_norm": 0.498046875, |
| "learning_rate": 8.211324604677711e-06, |
| "loss": 0.4149, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.0229132569558101, |
| "grad_norm": 0.51171875, |
| "learning_rate": 8.183326964413832e-06, |
| "loss": 0.4204, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.0310965630114566, |
| "grad_norm": 0.51953125, |
| "learning_rate": 8.155181766093893e-06, |
| "loss": 0.4049, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.039279869067103, |
| "grad_norm": 0.53515625, |
| "learning_rate": 8.12689115785467e-06, |
| "loss": 0.3967, |
| "step": 635 |
| }, |
| { |
| "epoch": 1.0474631751227497, |
| "grad_norm": 0.515625, |
| "learning_rate": 8.098457298931113e-06, |
| "loss": 0.3976, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.055646481178396, |
| "grad_norm": 0.5234375, |
| "learning_rate": 8.069882359491555e-06, |
| "loss": 0.4098, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.0638297872340425, |
| "grad_norm": 0.546875, |
| "learning_rate": 8.041168520472065e-06, |
| "loss": 0.3955, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.072013093289689, |
| "grad_norm": 0.50390625, |
| "learning_rate": 8.012317973410001e-06, |
| "loss": 0.3852, |
| "step": 655 |
| }, |
| { |
| "epoch": 1.0801963993453354, |
| "grad_norm": 0.515625, |
| "learning_rate": 7.983332920276745e-06, |
| "loss": 0.4018, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.088379705400982, |
| "grad_norm": 0.546875, |
| "learning_rate": 7.95421557330963e-06, |
| "loss": 0.3948, |
| "step": 665 |
| }, |
| { |
| "epoch": 1.0965630114566285, |
| "grad_norm": 0.50390625, |
| "learning_rate": 7.924968154843108e-06, |
| "loss": 0.3918, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.104746317512275, |
| "grad_norm": 0.5625, |
| "learning_rate": 7.895592897139128e-06, |
| "loss": 0.4182, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.1129296235679214, |
| "grad_norm": 0.51953125, |
| "learning_rate": 7.866092042216755e-06, |
| "loss": 0.3837, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.121112929623568, |
| "grad_norm": 0.50390625, |
| "learning_rate": 7.836467841681066e-06, |
| "loss": 0.4191, |
| "step": 685 |
| }, |
| { |
| "epoch": 1.1292962356792144, |
| "grad_norm": 0.5234375, |
| "learning_rate": 7.806722556551292e-06, |
| "loss": 0.4124, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.1374795417348609, |
| "grad_norm": 0.51171875, |
| "learning_rate": 7.776858457088249e-06, |
| "loss": 0.4093, |
| "step": 695 |
| }, |
| { |
| "epoch": 1.1456628477905073, |
| "grad_norm": 0.50390625, |
| "learning_rate": 7.746877822621059e-06, |
| "loss": 0.4067, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.1538461538461537, |
| "grad_norm": 0.5078125, |
| "learning_rate": 7.716782941373201e-06, |
| "loss": 0.4061, |
| "step": 705 |
| }, |
| { |
| "epoch": 1.1620294599018004, |
| "grad_norm": 0.51953125, |
| "learning_rate": 7.68657611028785e-06, |
| "loss": 0.3947, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.1702127659574468, |
| "grad_norm": 0.546875, |
| "learning_rate": 7.656259634852566e-06, |
| "loss": 0.4099, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.1783960720130933, |
| "grad_norm": 0.515625, |
| "learning_rate": 7.625835828923344e-06, |
| "loss": 0.3978, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.1865793780687397, |
| "grad_norm": 0.515625, |
| "learning_rate": 7.595307014548e-06, |
| "loss": 0.4063, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.1947626841243864, |
| "grad_norm": 0.51953125, |
| "learning_rate": 7.5646755217889555e-06, |
| "loss": 0.3915, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.2029459901800328, |
| "grad_norm": 0.5234375, |
| "learning_rate": 7.533943688545391e-06, |
| "loss": 0.4024, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.2111292962356792, |
| "grad_norm": 0.51171875, |
| "learning_rate": 7.503113860374813e-06, |
| "loss": 0.4142, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.2193126022913257, |
| "grad_norm": 0.53125, |
| "learning_rate": 7.472188390314029e-06, |
| "loss": 0.3906, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.227495908346972, |
| "grad_norm": 0.54296875, |
| "learning_rate": 7.441169638699565e-06, |
| "loss": 0.3984, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.2356792144026187, |
| "grad_norm": 0.51171875, |
| "learning_rate": 7.4100599729875045e-06, |
| "loss": 0.3901, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.2405891980360066, |
| "eval_loss": 0.39578545093536377, |
| "eval_runtime": 5.0485, |
| "eval_samples_per_second": 16.441, |
| "eval_steps_per_second": 16.441, |
| "step": 758 |
| }, |
| { |
| "epoch": 1.2438625204582652, |
| "grad_norm": 0.53515625, |
| "learning_rate": 7.378861767572808e-06, |
| "loss": 0.3995, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.2520458265139116, |
| "grad_norm": 0.54296875, |
| "learning_rate": 7.347577403608084e-06, |
| "loss": 0.3964, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.260229132569558, |
| "grad_norm": 0.53515625, |
| "learning_rate": 7.316209268821852e-06, |
| "loss": 0.3901, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.2684124386252047, |
| "grad_norm": 0.5546875, |
| "learning_rate": 7.284759757336304e-06, |
| "loss": 0.3886, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.2765957446808511, |
| "grad_norm": 0.51171875, |
| "learning_rate": 7.25323126948458e-06, |
| "loss": 0.4042, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.2847790507364976, |
| "grad_norm": 0.5234375, |
| "learning_rate": 7.221626211627557e-06, |
| "loss": 0.3994, |
| "step": 785 |
| }, |
| { |
| "epoch": 1.292962356792144, |
| "grad_norm": 0.53125, |
| "learning_rate": 7.1899469959702024e-06, |
| "loss": 0.3935, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.3011456628477904, |
| "grad_norm": 0.50390625, |
| "learning_rate": 7.158196040377452e-06, |
| "loss": 0.3902, |
| "step": 795 |
| }, |
| { |
| "epoch": 1.3093289689034369, |
| "grad_norm": 0.5, |
| "learning_rate": 7.12637576818968e-06, |
| "loss": 0.3938, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.3175122749590835, |
| "grad_norm": 0.5078125, |
| "learning_rate": 7.094488608037731e-06, |
| "loss": 0.3892, |
| "step": 805 |
| }, |
| { |
| "epoch": 1.32569558101473, |
| "grad_norm": 0.51171875, |
| "learning_rate": 7.062536993657574e-06, |
| "loss": 0.3957, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.3338788870703764, |
| "grad_norm": 0.52734375, |
| "learning_rate": 7.0305233637045375e-06, |
| "loss": 0.4083, |
| "step": 815 |
| }, |
| { |
| "epoch": 1.342062193126023, |
| "grad_norm": 0.50390625, |
| "learning_rate": 6.998450161567189e-06, |
| "loss": 0.3917, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.3502454991816695, |
| "grad_norm": 0.53125, |
| "learning_rate": 6.966319835180849e-06, |
| "loss": 0.3878, |
| "step": 825 |
| }, |
| { |
| "epoch": 1.358428805237316, |
| "grad_norm": 0.53125, |
| "learning_rate": 6.9341348368407505e-06, |
| "loss": 0.4083, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.3666121112929623, |
| "grad_norm": 0.515625, |
| "learning_rate": 6.901897623014877e-06, |
| "loss": 0.3942, |
| "step": 835 |
| }, |
| { |
| "epoch": 1.3747954173486088, |
| "grad_norm": 0.5078125, |
| "learning_rate": 6.869610654156476e-06, |
| "loss": 0.3856, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.3829787234042552, |
| "grad_norm": 0.51953125, |
| "learning_rate": 6.837276394516264e-06, |
| "loss": 0.4077, |
| "step": 845 |
| }, |
| { |
| "epoch": 1.3911620294599019, |
| "grad_norm": 0.52734375, |
| "learning_rate": 6.804897311954354e-06, |
| "loss": 0.3987, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.3993453355155483, |
| "grad_norm": 0.53125, |
| "learning_rate": 6.77247587775189e-06, |
| "loss": 0.4003, |
| "step": 855 |
| }, |
| { |
| "epoch": 1.4075286415711947, |
| "grad_norm": 0.52734375, |
| "learning_rate": 6.7400145664224445e-06, |
| "loss": 0.4041, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.4157119476268412, |
| "grad_norm": 0.5234375, |
| "learning_rate": 6.707515855523141e-06, |
| "loss": 0.4082, |
| "step": 865 |
| }, |
| { |
| "epoch": 1.4238952536824878, |
| "grad_norm": 0.5, |
| "learning_rate": 6.674982225465568e-06, |
| "loss": 0.3739, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.4320785597381342, |
| "grad_norm": 0.5, |
| "learning_rate": 6.642416159326462e-06, |
| "loss": 0.3866, |
| "step": 875 |
| }, |
| { |
| "epoch": 1.4402618657937807, |
| "grad_norm": 0.5234375, |
| "learning_rate": 6.609820142658186e-06, |
| "loss": 0.4101, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.4484451718494271, |
| "grad_norm": 0.5234375, |
| "learning_rate": 6.577196663299039e-06, |
| "loss": 0.3888, |
| "step": 885 |
| }, |
| { |
| "epoch": 1.4566284779050735, |
| "grad_norm": 0.50390625, |
| "learning_rate": 6.544548211183355e-06, |
| "loss": 0.3937, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.4648117839607202, |
| "grad_norm": 0.53125, |
| "learning_rate": 6.511877278151479e-06, |
| "loss": 0.3989, |
| "step": 895 |
| }, |
| { |
| "epoch": 1.4729950900163666, |
| "grad_norm": 0.5078125, |
| "learning_rate": 6.479186357759575e-06, |
| "loss": 0.4157, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.481178396072013, |
| "grad_norm": 0.52734375, |
| "learning_rate": 6.4464779450893086e-06, |
| "loss": 0.388, |
| "step": 905 |
| }, |
| { |
| "epoch": 1.4893617021276595, |
| "grad_norm": 0.50390625, |
| "learning_rate": 6.413754536557416e-06, |
| "loss": 0.3916, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.4975450081833062, |
| "grad_norm": 0.52734375, |
| "learning_rate": 6.381018629725169e-06, |
| "loss": 0.4073, |
| "step": 915 |
| }, |
| { |
| "epoch": 1.5057283142389526, |
| "grad_norm": 0.53125, |
| "learning_rate": 6.34827272310775e-06, |
| "loss": 0.401, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.513911620294599, |
| "grad_norm": 0.515625, |
| "learning_rate": 6.315519315983562e-06, |
| "loss": 0.3859, |
| "step": 925 |
| }, |
| { |
| "epoch": 1.5220949263502455, |
| "grad_norm": 0.51171875, |
| "learning_rate": 6.282760908203467e-06, |
| "loss": 0.3952, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.530278232405892, |
| "grad_norm": 0.50390625, |
| "learning_rate": 6.25e-06, |
| "loss": 0.384, |
| "step": 935 |
| }, |
| { |
| "epoch": 1.5384615384615383, |
| "grad_norm": 0.53125, |
| "learning_rate": 6.2172390917965345e-06, |
| "loss": 0.3937, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.546644844517185, |
| "grad_norm": 0.490234375, |
| "learning_rate": 6.18448068401644e-06, |
| "loss": 0.3811, |
| "step": 945 |
| }, |
| { |
| "epoch": 1.5548281505728314, |
| "grad_norm": 0.54296875, |
| "learning_rate": 6.151727276892252e-06, |
| "loss": 0.3966, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.563011456628478, |
| "grad_norm": 0.53125, |
| "learning_rate": 6.118981370274833e-06, |
| "loss": 0.4045, |
| "step": 955 |
| }, |
| { |
| "epoch": 1.5711947626841245, |
| "grad_norm": 0.53125, |
| "learning_rate": 6.086245463442586e-06, |
| "loss": 0.3798, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.579378068739771, |
| "grad_norm": 0.53515625, |
| "learning_rate": 6.0535220549106946e-06, |
| "loss": 0.4011, |
| "step": 965 |
| }, |
| { |
| "epoch": 1.5875613747954174, |
| "grad_norm": 0.53125, |
| "learning_rate": 6.020813642240426e-06, |
| "loss": 0.3887, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.5957446808510638, |
| "grad_norm": 0.52734375, |
| "learning_rate": 5.988122721848521e-06, |
| "loss": 0.4012, |
| "step": 975 |
| }, |
| { |
| "epoch": 1.6039279869067102, |
| "grad_norm": 0.52734375, |
| "learning_rate": 5.955451788816645e-06, |
| "loss": 0.403, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.6121112929623567, |
| "grad_norm": 0.52734375, |
| "learning_rate": 5.922803336700962e-06, |
| "loss": 0.3879, |
| "step": 985 |
| }, |
| { |
| "epoch": 1.6202945990180033, |
| "grad_norm": 0.55859375, |
| "learning_rate": 5.890179857341814e-06, |
| "loss": 0.4001, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.6284779050736498, |
| "grad_norm": 0.51953125, |
| "learning_rate": 5.85758384067354e-06, |
| "loss": 0.4184, |
| "step": 995 |
| }, |
| { |
| "epoch": 1.6366612111292962, |
| "grad_norm": 0.51953125, |
| "learning_rate": 5.825017774534434e-06, |
| "loss": 0.4192, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.6366612111292962, |
| "eval_loss": 0.3935750126838684, |
| "eval_runtime": 5.036, |
| "eval_samples_per_second": 16.481, |
| "eval_steps_per_second": 16.481, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.6448445171849428, |
| "grad_norm": 0.53125, |
| "learning_rate": 5.7924841444768585e-06, |
| "loss": 0.3859, |
| "step": 1005 |
| }, |
| { |
| "epoch": 1.6530278232405893, |
| "grad_norm": 0.51171875, |
| "learning_rate": 5.759985433577557e-06, |
| "loss": 0.3973, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.6612111292962357, |
| "grad_norm": 0.52734375, |
| "learning_rate": 5.727524122248112e-06, |
| "loss": 0.4077, |
| "step": 1015 |
| }, |
| { |
| "epoch": 1.6693944353518821, |
| "grad_norm": 0.51171875, |
| "learning_rate": 5.695102688045649e-06, |
| "loss": 0.3916, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.6775777414075286, |
| "grad_norm": 0.50390625, |
| "learning_rate": 5.662723605483738e-06, |
| "loss": 0.3852, |
| "step": 1025 |
| }, |
| { |
| "epoch": 1.685761047463175, |
| "grad_norm": 0.546875, |
| "learning_rate": 5.6303893458435255e-06, |
| "loss": 0.4055, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.6939443535188214, |
| "grad_norm": 0.5390625, |
| "learning_rate": 5.598102376985124e-06, |
| "loss": 0.4046, |
| "step": 1035 |
| }, |
| { |
| "epoch": 1.702127659574468, |
| "grad_norm": 0.53515625, |
| "learning_rate": 5.565865163159252e-06, |
| "loss": 0.3868, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.7103109656301145, |
| "grad_norm": 0.51171875, |
| "learning_rate": 5.5336801648191525e-06, |
| "loss": 0.3929, |
| "step": 1045 |
| }, |
| { |
| "epoch": 1.7184942716857612, |
| "grad_norm": 0.52734375, |
| "learning_rate": 5.501549838432812e-06, |
| "loss": 0.4018, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.7266775777414076, |
| "grad_norm": 0.52734375, |
| "learning_rate": 5.469476636295463e-06, |
| "loss": 0.4009, |
| "step": 1055 |
| }, |
| { |
| "epoch": 1.734860883797054, |
| "grad_norm": 0.5390625, |
| "learning_rate": 5.437463006342427e-06, |
| "loss": 0.3959, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.7430441898527005, |
| "grad_norm": 0.51953125, |
| "learning_rate": 5.4055113919622714e-06, |
| "loss": 0.3921, |
| "step": 1065 |
| }, |
| { |
| "epoch": 1.751227495908347, |
| "grad_norm": 0.53125, |
| "learning_rate": 5.373624231810322e-06, |
| "loss": 0.3869, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.7594108019639934, |
| "grad_norm": 0.52734375, |
| "learning_rate": 5.341803959622549e-06, |
| "loss": 0.3866, |
| "step": 1075 |
| }, |
| { |
| "epoch": 1.7675941080196398, |
| "grad_norm": 0.52734375, |
| "learning_rate": 5.310053004029798e-06, |
| "loss": 0.3853, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.7757774140752864, |
| "grad_norm": 0.54296875, |
| "learning_rate": 5.278373788372444e-06, |
| "loss": 0.4065, |
| "step": 1085 |
| }, |
| { |
| "epoch": 1.7839607201309329, |
| "grad_norm": 0.49609375, |
| "learning_rate": 5.246768730515424e-06, |
| "loss": 0.3839, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.7921440261865795, |
| "grad_norm": 0.515625, |
| "learning_rate": 5.2152402426636975e-06, |
| "loss": 0.3914, |
| "step": 1095 |
| }, |
| { |
| "epoch": 1.800327332242226, |
| "grad_norm": 0.53125, |
| "learning_rate": 5.183790731178151e-06, |
| "loss": 0.3846, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.8085106382978724, |
| "grad_norm": 0.51171875, |
| "learning_rate": 5.152422596391917e-06, |
| "loss": 0.3841, |
| "step": 1105 |
| }, |
| { |
| "epoch": 1.8166939443535188, |
| "grad_norm": 0.5078125, |
| "learning_rate": 5.121138232427193e-06, |
| "loss": 0.3932, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.8248772504091653, |
| "grad_norm": 0.52734375, |
| "learning_rate": 5.089940027012498e-06, |
| "loss": 0.4066, |
| "step": 1115 |
| }, |
| { |
| "epoch": 1.8330605564648117, |
| "grad_norm": 0.53515625, |
| "learning_rate": 5.058830361300437e-06, |
| "loss": 0.3977, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.8412438625204581, |
| "grad_norm": 0.54296875, |
| "learning_rate": 5.027811609685972e-06, |
| "loss": 0.4038, |
| "step": 1125 |
| }, |
| { |
| "epoch": 1.8494271685761048, |
| "grad_norm": 0.51953125, |
| "learning_rate": 4.9968861396251884e-06, |
| "loss": 0.3984, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.8576104746317512, |
| "grad_norm": 0.5390625, |
| "learning_rate": 4.96605631145461e-06, |
| "loss": 0.3976, |
| "step": 1135 |
| }, |
| { |
| "epoch": 1.8657937806873979, |
| "grad_norm": 0.53515625, |
| "learning_rate": 4.935324478211047e-06, |
| "loss": 0.4076, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.8739770867430443, |
| "grad_norm": 0.5234375, |
| "learning_rate": 4.9046929854520014e-06, |
| "loss": 0.3916, |
| "step": 1145 |
| }, |
| { |
| "epoch": 1.8821603927986907, |
| "grad_norm": 0.51171875, |
| "learning_rate": 4.8741641710766595e-06, |
| "loss": 0.3976, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.8903436988543372, |
| "grad_norm": 0.54296875, |
| "learning_rate": 4.843740365147435e-06, |
| "loss": 0.4034, |
| "step": 1155 |
| }, |
| { |
| "epoch": 1.8985270049099836, |
| "grad_norm": 0.5, |
| "learning_rate": 4.8134238897121515e-06, |
| "loss": 0.4105, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.90671031096563, |
| "grad_norm": 0.55078125, |
| "learning_rate": 4.783217058626799e-06, |
| "loss": 0.4022, |
| "step": 1165 |
| }, |
| { |
| "epoch": 1.9148936170212765, |
| "grad_norm": 0.5625, |
| "learning_rate": 4.753122177378941e-06, |
| "loss": 0.3885, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.9230769230769231, |
| "grad_norm": 0.5234375, |
| "learning_rate": 4.723141542911755e-06, |
| "loss": 0.4192, |
| "step": 1175 |
| }, |
| { |
| "epoch": 1.9312602291325696, |
| "grad_norm": 0.5, |
| "learning_rate": 4.693277443448709e-06, |
| "loss": 0.3854, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.939443535188216, |
| "grad_norm": 0.5546875, |
| "learning_rate": 4.663532158318936e-06, |
| "loss": 0.4069, |
| "step": 1185 |
| }, |
| { |
| "epoch": 1.9476268412438626, |
| "grad_norm": 0.5078125, |
| "learning_rate": 4.633907957783249e-06, |
| "loss": 0.4069, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.955810147299509, |
| "grad_norm": 0.5546875, |
| "learning_rate": 4.604407102860875e-06, |
| "loss": 0.4052, |
| "step": 1195 |
| }, |
| { |
| "epoch": 1.9639934533551555, |
| "grad_norm": 0.54296875, |
| "learning_rate": 4.575031845156893e-06, |
| "loss": 0.4004, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.972176759410802, |
| "grad_norm": 0.51953125, |
| "learning_rate": 4.545784426690371e-06, |
| "loss": 0.3926, |
| "step": 1205 |
| }, |
| { |
| "epoch": 1.9803600654664484, |
| "grad_norm": 0.5078125, |
| "learning_rate": 4.516667079723257e-06, |
| "loss": 0.3958, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.9885433715220948, |
| "grad_norm": 0.515625, |
| "learning_rate": 4.48768202659e-06, |
| "loss": 0.401, |
| "step": 1215 |
| }, |
| { |
| "epoch": 1.9967266775777412, |
| "grad_norm": 0.53125, |
| "learning_rate": 4.458831479527936e-06, |
| "loss": 0.392, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.9967266775777412, |
| "eval_loss": 0.3923807144165039, |
| "eval_runtime": 5.0473, |
| "eval_samples_per_second": 16.445, |
| "eval_steps_per_second": 16.445, |
| "step": 1220 |
| }, |
| { |
| "epoch": 2.0049099836333877, |
| "grad_norm": 0.49609375, |
| "learning_rate": 4.430117640508447e-06, |
| "loss": 0.3889, |
| "step": 1225 |
| }, |
| { |
| "epoch": 2.0130932896890346, |
| "grad_norm": 0.53125, |
| "learning_rate": 4.401542701068887e-06, |
| "loss": 0.3813, |
| "step": 1230 |
| }, |
| { |
| "epoch": 2.021276595744681, |
| "grad_norm": 0.51171875, |
| "learning_rate": 4.373108842145332e-06, |
| "loss": 0.3885, |
| "step": 1235 |
| }, |
| { |
| "epoch": 2.0294599018003274, |
| "grad_norm": 0.49609375, |
| "learning_rate": 4.344818233906108e-06, |
| "loss": 0.388, |
| "step": 1240 |
| }, |
| { |
| "epoch": 2.037643207855974, |
| "grad_norm": 0.49609375, |
| "learning_rate": 4.316673035586168e-06, |
| "loss": 0.3848, |
| "step": 1245 |
| }, |
| { |
| "epoch": 2.0458265139116203, |
| "grad_norm": 0.5078125, |
| "learning_rate": 4.288675395322291e-06, |
| "loss": 0.3885, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.0540098199672667, |
| "grad_norm": 0.5546875, |
| "learning_rate": 4.26082744998912e-06, |
| "loss": 0.382, |
| "step": 1255 |
| }, |
| { |
| "epoch": 2.062193126022913, |
| "grad_norm": 0.5234375, |
| "learning_rate": 4.233131325036077e-06, |
| "loss": 0.38, |
| "step": 1260 |
| }, |
| { |
| "epoch": 2.0703764320785596, |
| "grad_norm": 0.5234375, |
| "learning_rate": 4.205589134325138e-06, |
| "loss": 0.3926, |
| "step": 1265 |
| }, |
| { |
| "epoch": 2.078559738134206, |
| "grad_norm": 0.515625, |
| "learning_rate": 4.178202979969499e-06, |
| "loss": 0.3936, |
| "step": 1270 |
| }, |
| { |
| "epoch": 2.086743044189853, |
| "grad_norm": 0.53125, |
| "learning_rate": 4.15097495217313e-06, |
| "loss": 0.3862, |
| "step": 1275 |
| }, |
| { |
| "epoch": 2.0949263502454993, |
| "grad_norm": 0.5546875, |
| "learning_rate": 4.1239071290712485e-06, |
| "loss": 0.3968, |
| "step": 1280 |
| }, |
| { |
| "epoch": 2.1031096563011458, |
| "grad_norm": 0.52734375, |
| "learning_rate": 4.0970015765717105e-06, |
| "loss": 0.4087, |
| "step": 1285 |
| }, |
| { |
| "epoch": 2.111292962356792, |
| "grad_norm": 0.5078125, |
| "learning_rate": 4.070260348197324e-06, |
| "loss": 0.3784, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.1194762684124386, |
| "grad_norm": 0.51953125, |
| "learning_rate": 4.043685484929132e-06, |
| "loss": 0.3852, |
| "step": 1295 |
| }, |
| { |
| "epoch": 2.127659574468085, |
| "grad_norm": 0.50390625, |
| "learning_rate": 4.0172790150506215e-06, |
| "loss": 0.3756, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.1358428805237315, |
| "grad_norm": 0.5234375, |
| "learning_rate": 3.991042953992931e-06, |
| "loss": 0.3841, |
| "step": 1305 |
| }, |
| { |
| "epoch": 2.144026186579378, |
| "grad_norm": 0.52734375, |
| "learning_rate": 3.96497930418102e-06, |
| "loss": 0.3847, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.1522094926350244, |
| "grad_norm": 0.515625, |
| "learning_rate": 3.939090054880839e-06, |
| "loss": 0.3826, |
| "step": 1315 |
| }, |
| { |
| "epoch": 2.160392798690671, |
| "grad_norm": 0.5078125, |
| "learning_rate": 3.913377182047498e-06, |
| "loss": 0.3882, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.1685761047463177, |
| "grad_norm": 0.53515625, |
| "learning_rate": 3.887842648174465e-06, |
| "loss": 0.3998, |
| "step": 1325 |
| }, |
| { |
| "epoch": 2.176759410801964, |
| "grad_norm": 0.51171875, |
| "learning_rate": 3.862488402143767e-06, |
| "loss": 0.3782, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.1849427168576105, |
| "grad_norm": 0.53125, |
| "learning_rate": 3.8373163790772595e-06, |
| "loss": 0.3937, |
| "step": 1335 |
| }, |
| { |
| "epoch": 2.193126022913257, |
| "grad_norm": 0.51953125, |
| "learning_rate": 3.812328500188919e-06, |
| "loss": 0.3967, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.2013093289689034, |
| "grad_norm": 0.54296875, |
| "learning_rate": 3.78752667263822e-06, |
| "loss": 0.3856, |
| "step": 1345 |
| }, |
| { |
| "epoch": 2.20949263502455, |
| "grad_norm": 0.5078125, |
| "learning_rate": 3.762912789384568e-06, |
| "loss": 0.377, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.2176759410801963, |
| "grad_norm": 0.53125, |
| "learning_rate": 3.738488729042821e-06, |
| "loss": 0.3853, |
| "step": 1355 |
| }, |
| { |
| "epoch": 2.2258592471358427, |
| "grad_norm": 0.5234375, |
| "learning_rate": 3.7142563557399145e-06, |
| "loss": 0.386, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.2340425531914896, |
| "grad_norm": 0.51953125, |
| "learning_rate": 3.6902175189725764e-06, |
| "loss": 0.3881, |
| "step": 1365 |
| }, |
| { |
| "epoch": 2.242225859247136, |
| "grad_norm": 0.51953125, |
| "learning_rate": 3.666374053466175e-06, |
| "loss": 0.3934, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.2504091653027825, |
| "grad_norm": 0.53515625, |
| "learning_rate": 3.642727779034681e-06, |
| "loss": 0.391, |
| "step": 1375 |
| }, |
| { |
| "epoch": 2.258592471358429, |
| "grad_norm": 0.5078125, |
| "learning_rate": 3.6192805004417732e-06, |
| "loss": 0.3818, |
| "step": 1380 |
| }, |
| { |
| "epoch": 2.2667757774140753, |
| "grad_norm": 0.498046875, |
| "learning_rate": 3.5960340072630984e-06, |
| "loss": 0.3861, |
| "step": 1385 |
| }, |
| { |
| "epoch": 2.2749590834697218, |
| "grad_norm": 0.51171875, |
| "learning_rate": 3.572990073749678e-06, |
| "loss": 0.3989, |
| "step": 1390 |
| }, |
| { |
| "epoch": 2.283142389525368, |
| "grad_norm": 0.51171875, |
| "learning_rate": 3.550150458692497e-06, |
| "loss": 0.3973, |
| "step": 1395 |
| }, |
| { |
| "epoch": 2.2913256955810146, |
| "grad_norm": 0.51171875, |
| "learning_rate": 3.527516905288261e-06, |
| "loss": 0.3724, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.299509001636661, |
| "grad_norm": 0.51953125, |
| "learning_rate": 3.505091141006354e-06, |
| "loss": 0.3882, |
| "step": 1405 |
| }, |
| { |
| "epoch": 2.3076923076923075, |
| "grad_norm": 0.5234375, |
| "learning_rate": 3.4828748774569967e-06, |
| "loss": 0.3945, |
| "step": 1410 |
| }, |
| { |
| "epoch": 2.3158756137479544, |
| "grad_norm": 0.55078125, |
| "learning_rate": 3.460869810260595e-06, |
| "loss": 0.3919, |
| "step": 1415 |
| }, |
| { |
| "epoch": 2.324058919803601, |
| "grad_norm": 0.52734375, |
| "learning_rate": 3.4390776189183435e-06, |
| "loss": 0.3875, |
| "step": 1420 |
| }, |
| { |
| "epoch": 2.3322422258592472, |
| "grad_norm": 0.52734375, |
| "learning_rate": 3.4174999666840257e-06, |
| "loss": 0.3817, |
| "step": 1425 |
| }, |
| { |
| "epoch": 2.3404255319148937, |
| "grad_norm": 0.51171875, |
| "learning_rate": 3.396138500437076e-06, |
| "loss": 0.392, |
| "step": 1430 |
| }, |
| { |
| "epoch": 2.34860883797054, |
| "grad_norm": 0.5, |
| "learning_rate": 3.374994850556884e-06, |
| "loss": 0.3902, |
| "step": 1435 |
| }, |
| { |
| "epoch": 2.3567921440261865, |
| "grad_norm": 0.53515625, |
| "learning_rate": 3.354070630798355e-06, |
| "loss": 0.3935, |
| "step": 1440 |
| }, |
| { |
| "epoch": 2.364975450081833, |
| "grad_norm": 0.54296875, |
| "learning_rate": 3.3333674381687476e-06, |
| "loss": 0.3904, |
| "step": 1445 |
| }, |
| { |
| "epoch": 2.3731587561374794, |
| "grad_norm": 0.53125, |
| "learning_rate": 3.312886852805779e-06, |
| "loss": 0.403, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.381342062193126, |
| "grad_norm": 0.5234375, |
| "learning_rate": 3.292630437857026e-06, |
| "loss": 0.4007, |
| "step": 1455 |
| }, |
| { |
| "epoch": 2.3895253682487727, |
| "grad_norm": 0.53515625, |
| "learning_rate": 3.2725997393606266e-06, |
| "loss": 0.3833, |
| "step": 1460 |
| }, |
| { |
| "epoch": 2.397708674304419, |
| "grad_norm": 0.51953125, |
| "learning_rate": 3.2527962861272695e-06, |
| "loss": 0.3862, |
| "step": 1465 |
| }, |
| { |
| "epoch": 2.4058919803600656, |
| "grad_norm": 0.5078125, |
| "learning_rate": 3.2332215896235176e-06, |
| "loss": 0.3923, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.414075286415712, |
| "grad_norm": 0.50390625, |
| "learning_rate": 3.2138771438564465e-06, |
| "loss": 0.379, |
| "step": 1475 |
| }, |
| { |
| "epoch": 2.4222585924713584, |
| "grad_norm": 0.546875, |
| "learning_rate": 3.194764425259615e-06, |
| "loss": 0.3766, |
| "step": 1480 |
| }, |
| { |
| "epoch": 2.430441898527005, |
| "grad_norm": 0.50390625, |
| "learning_rate": 3.1758848925803846e-06, |
| "loss": 0.3904, |
| "step": 1485 |
| }, |
| { |
| "epoch": 2.4386252045826513, |
| "grad_norm": 0.51953125, |
| "learning_rate": 3.1572399867685727e-06, |
| "loss": 0.3922, |
| "step": 1490 |
| }, |
| { |
| "epoch": 2.4468085106382977, |
| "grad_norm": 0.5625, |
| "learning_rate": 3.138831130866484e-06, |
| "loss": 0.392, |
| "step": 1495 |
| }, |
| { |
| "epoch": 2.454991816693944, |
| "grad_norm": 0.53125, |
| "learning_rate": 3.1206597299002948e-06, |
| "loss": 0.3825, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.454991816693944, |
| "eval_loss": 0.39173567295074463, |
| "eval_runtime": 5.0609, |
| "eval_samples_per_second": 16.4, |
| "eval_steps_per_second": 16.4, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.4631751227495906, |
| "grad_norm": 0.53125, |
| "learning_rate": 3.1027271707728147e-06, |
| "loss": 0.3988, |
| "step": 1505 |
| }, |
| { |
| "epoch": 2.4713584288052375, |
| "grad_norm": 0.53125, |
| "learning_rate": 3.0850348221576405e-06, |
| "loss": 0.3944, |
| "step": 1510 |
| }, |
| { |
| "epoch": 2.479541734860884, |
| "grad_norm": 0.51953125, |
| "learning_rate": 3.067584034394684e-06, |
| "loss": 0.394, |
| "step": 1515 |
| }, |
| { |
| "epoch": 2.4877250409165304, |
| "grad_norm": 0.51953125, |
| "learning_rate": 3.050376139387121e-06, |
| "loss": 0.3959, |
| "step": 1520 |
| }, |
| { |
| "epoch": 2.495908346972177, |
| "grad_norm": 0.5078125, |
| "learning_rate": 3.0334124504997275e-06, |
| "loss": 0.384, |
| "step": 1525 |
| }, |
| { |
| "epoch": 2.504091653027823, |
| "grad_norm": 0.53125, |
| "learning_rate": 3.016694262458642e-06, |
| "loss": 0.384, |
| "step": 1530 |
| }, |
| { |
| "epoch": 2.5122749590834696, |
| "grad_norm": 0.50390625, |
| "learning_rate": 3.0002228512525485e-06, |
| "loss": 0.3892, |
| "step": 1535 |
| }, |
| { |
| "epoch": 2.520458265139116, |
| "grad_norm": 0.51953125, |
| "learning_rate": 2.983999474035285e-06, |
| "loss": 0.3921, |
| "step": 1540 |
| }, |
| { |
| "epoch": 2.528641571194763, |
| "grad_norm": 0.52734375, |
| "learning_rate": 2.968025369029902e-06, |
| "loss": 0.4082, |
| "step": 1545 |
| }, |
| { |
| "epoch": 2.5368248772504094, |
| "grad_norm": 0.52734375, |
| "learning_rate": 2.9523017554341465e-06, |
| "loss": 0.3814, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.545008183306056, |
| "grad_norm": 0.53125, |
| "learning_rate": 2.9368298333274148e-06, |
| "loss": 0.386, |
| "step": 1555 |
| }, |
| { |
| "epoch": 2.5531914893617023, |
| "grad_norm": 0.51953125, |
| "learning_rate": 2.921610783579157e-06, |
| "loss": 0.3794, |
| "step": 1560 |
| }, |
| { |
| "epoch": 2.5613747954173487, |
| "grad_norm": 0.5234375, |
| "learning_rate": 2.9066457677587488e-06, |
| "loss": 0.3805, |
| "step": 1565 |
| }, |
| { |
| "epoch": 2.569558101472995, |
| "grad_norm": 0.53515625, |
| "learning_rate": 2.891935928046839e-06, |
| "loss": 0.4039, |
| "step": 1570 |
| }, |
| { |
| "epoch": 2.5777414075286416, |
| "grad_norm": 0.5234375, |
| "learning_rate": 2.8774823871481695e-06, |
| "loss": 0.3814, |
| "step": 1575 |
| }, |
| { |
| "epoch": 2.585924713584288, |
| "grad_norm": 0.53515625, |
| "learning_rate": 2.86328624820589e-06, |
| "loss": 0.3798, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.5941080196399344, |
| "grad_norm": 0.54296875, |
| "learning_rate": 2.8493485947173643e-06, |
| "loss": 0.3909, |
| "step": 1585 |
| }, |
| { |
| "epoch": 2.602291325695581, |
| "grad_norm": 0.53125, |
| "learning_rate": 2.8356704904514683e-06, |
| "loss": 0.3958, |
| "step": 1590 |
| }, |
| { |
| "epoch": 2.6104746317512273, |
| "grad_norm": 0.54296875, |
| "learning_rate": 2.8222529793674055e-06, |
| "loss": 0.3929, |
| "step": 1595 |
| }, |
| { |
| "epoch": 2.6186579378068737, |
| "grad_norm": 0.5234375, |
| "learning_rate": 2.8090970855350252e-06, |
| "loss": 0.4019, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.6268412438625206, |
| "grad_norm": 0.51171875, |
| "learning_rate": 2.7962038130566616e-06, |
| "loss": 0.3837, |
| "step": 1605 |
| }, |
| { |
| "epoch": 2.635024549918167, |
| "grad_norm": 0.51953125, |
| "learning_rate": 2.783574145990501e-06, |
| "loss": 0.4116, |
| "step": 1610 |
| }, |
| { |
| "epoch": 2.6432078559738135, |
| "grad_norm": 0.51171875, |
| "learning_rate": 2.7712090482754683e-06, |
| "loss": 0.386, |
| "step": 1615 |
| }, |
| { |
| "epoch": 2.65139116202946, |
| "grad_norm": 0.5234375, |
| "learning_rate": 2.759109463657666e-06, |
| "loss": 0.3837, |
| "step": 1620 |
| }, |
| { |
| "epoch": 2.6595744680851063, |
| "grad_norm": 0.53515625, |
| "learning_rate": 2.7472763156183346e-06, |
| "loss": 0.4019, |
| "step": 1625 |
| }, |
| { |
| "epoch": 2.6677577741407528, |
| "grad_norm": 0.50390625, |
| "learning_rate": 2.735710507303374e-06, |
| "loss": 0.4055, |
| "step": 1630 |
| }, |
| { |
| "epoch": 2.675941080196399, |
| "grad_norm": 0.51953125, |
| "learning_rate": 2.7244129214544123e-06, |
| "loss": 0.3838, |
| "step": 1635 |
| }, |
| { |
| "epoch": 2.684124386252046, |
| "grad_norm": 0.53515625, |
| "learning_rate": 2.7133844203414305e-06, |
| "loss": 0.3838, |
| "step": 1640 |
| }, |
| { |
| "epoch": 2.6923076923076925, |
| "grad_norm": 0.55078125, |
| "learning_rate": 2.7026258456969538e-06, |
| "loss": 0.3999, |
| "step": 1645 |
| }, |
| { |
| "epoch": 2.700490998363339, |
| "grad_norm": 0.50390625, |
| "learning_rate": 2.6921380186518042e-06, |
| "loss": 0.3993, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.7086743044189854, |
| "grad_norm": 0.5078125, |
| "learning_rate": 2.6819217396724305e-06, |
| "loss": 0.3909, |
| "step": 1655 |
| }, |
| { |
| "epoch": 2.716857610474632, |
| "grad_norm": 0.52734375, |
| "learning_rate": 2.671977788499817e-06, |
| "loss": 0.3932, |
| "step": 1660 |
| }, |
| { |
| "epoch": 2.7250409165302782, |
| "grad_norm": 0.53125, |
| "learning_rate": 2.6623069240899642e-06, |
| "loss": 0.398, |
| "step": 1665 |
| }, |
| { |
| "epoch": 2.7332242225859247, |
| "grad_norm": 0.5078125, |
| "learning_rate": 2.6529098845559703e-06, |
| "loss": 0.3899, |
| "step": 1670 |
| }, |
| { |
| "epoch": 2.741407528641571, |
| "grad_norm": 0.52734375, |
| "learning_rate": 2.6437873871116903e-06, |
| "loss": 0.385, |
| "step": 1675 |
| }, |
| { |
| "epoch": 2.7495908346972175, |
| "grad_norm": 0.5234375, |
| "learning_rate": 2.6349401280169985e-06, |
| "loss": 0.3903, |
| "step": 1680 |
| }, |
| { |
| "epoch": 2.757774140752864, |
| "grad_norm": 0.51953125, |
| "learning_rate": 2.6263687825246463e-06, |
| "loss": 0.3708, |
| "step": 1685 |
| }, |
| { |
| "epoch": 2.7659574468085104, |
| "grad_norm": 0.51171875, |
| "learning_rate": 2.6180740048287274e-06, |
| "loss": 0.3821, |
| "step": 1690 |
| }, |
| { |
| "epoch": 2.774140752864157, |
| "grad_norm": 0.55859375, |
| "learning_rate": 2.610056428014743e-06, |
| "loss": 0.4001, |
| "step": 1695 |
| }, |
| { |
| "epoch": 2.7823240589198037, |
| "grad_norm": 0.546875, |
| "learning_rate": 2.6023166640112875e-06, |
| "loss": 0.4013, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.79050736497545, |
| "grad_norm": 0.52734375, |
| "learning_rate": 2.594855303543338e-06, |
| "loss": 0.3969, |
| "step": 1705 |
| }, |
| { |
| "epoch": 2.7986906710310966, |
| "grad_norm": 0.5703125, |
| "learning_rate": 2.587672916087175e-06, |
| "loss": 0.4152, |
| "step": 1710 |
| }, |
| { |
| "epoch": 2.806873977086743, |
| "grad_norm": 0.52734375, |
| "learning_rate": 2.5807700498269134e-06, |
| "loss": 0.3875, |
| "step": 1715 |
| }, |
| { |
| "epoch": 2.8150572831423895, |
| "grad_norm": 0.51171875, |
| "learning_rate": 2.574147231612665e-06, |
| "loss": 0.3951, |
| "step": 1720 |
| }, |
| { |
| "epoch": 2.823240589198036, |
| "grad_norm": 0.51953125, |
| "learning_rate": 2.5678049669203252e-06, |
| "loss": 0.404, |
| "step": 1725 |
| }, |
| { |
| "epoch": 2.8314238952536823, |
| "grad_norm": 0.5390625, |
| "learning_rate": 2.561743739812998e-06, |
| "loss": 0.3845, |
| "step": 1730 |
| }, |
| { |
| "epoch": 2.839607201309329, |
| "grad_norm": 0.5234375, |
| "learning_rate": 2.5559640129040464e-06, |
| "loss": 0.3885, |
| "step": 1735 |
| }, |
| { |
| "epoch": 2.8477905073649756, |
| "grad_norm": 0.515625, |
| "learning_rate": 2.550466227321786e-06, |
| "loss": 0.3967, |
| "step": 1740 |
| }, |
| { |
| "epoch": 2.855973813420622, |
| "grad_norm": 0.515625, |
| "learning_rate": 2.545250802675816e-06, |
| "loss": 0.3865, |
| "step": 1745 |
| }, |
| { |
| "epoch": 2.8641571194762685, |
| "grad_norm": 0.51953125, |
| "learning_rate": 2.540318137024994e-06, |
| "loss": 0.3873, |
| "step": 1750 |
| }, |
| { |
| "epoch": 2.872340425531915, |
| "grad_norm": 0.5546875, |
| "learning_rate": 2.5356686068470537e-06, |
| "loss": 0.3912, |
| "step": 1755 |
| }, |
| { |
| "epoch": 2.8805237315875614, |
| "grad_norm": 0.53515625, |
| "learning_rate": 2.5313025670098725e-06, |
| "loss": 0.3949, |
| "step": 1760 |
| }, |
| { |
| "epoch": 2.888707037643208, |
| "grad_norm": 0.546875, |
| "learning_rate": 2.5272203507443836e-06, |
| "loss": 0.3985, |
| "step": 1765 |
| }, |
| { |
| "epoch": 2.8968903436988542, |
| "grad_norm": 0.53125, |
| "learning_rate": 2.523422269619147e-06, |
| "loss": 0.3831, |
| "step": 1770 |
| }, |
| { |
| "epoch": 2.9050736497545007, |
| "grad_norm": 0.54296875, |
| "learning_rate": 2.5199086135165664e-06, |
| "loss": 0.3916, |
| "step": 1775 |
| }, |
| { |
| "epoch": 2.913256955810147, |
| "grad_norm": 0.51171875, |
| "learning_rate": 2.516679650610765e-06, |
| "loss": 0.4102, |
| "step": 1780 |
| }, |
| { |
| "epoch": 2.9214402618657935, |
| "grad_norm": 0.55859375, |
| "learning_rate": 2.5137356273471183e-06, |
| "loss": 0.407, |
| "step": 1785 |
| }, |
| { |
| "epoch": 2.9296235679214404, |
| "grad_norm": 0.53125, |
| "learning_rate": 2.511076768423443e-06, |
| "loss": 0.4131, |
| "step": 1790 |
| }, |
| { |
| "epoch": 2.937806873977087, |
| "grad_norm": 0.53125, |
| "learning_rate": 2.508703276772852e-06, |
| "loss": 0.3914, |
| "step": 1795 |
| }, |
| { |
| "epoch": 2.9459901800327333, |
| "grad_norm": 0.53125, |
| "learning_rate": 2.506615333548257e-06, |
| "loss": 0.4093, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.9541734860883797, |
| "grad_norm": 0.5, |
| "learning_rate": 2.5048130981085524e-06, |
| "loss": 0.3938, |
| "step": 1805 |
| }, |
| { |
| "epoch": 2.962356792144026, |
| "grad_norm": 0.53125, |
| "learning_rate": 2.5032967080064435e-06, |
| "loss": 0.4007, |
| "step": 1810 |
| }, |
| { |
| "epoch": 2.9705400981996726, |
| "grad_norm": 0.50390625, |
| "learning_rate": 2.5020662789779555e-06, |
| "loss": 0.3872, |
| "step": 1815 |
| }, |
| { |
| "epoch": 2.978723404255319, |
| "grad_norm": 0.55078125, |
| "learning_rate": 2.501121904933595e-06, |
| "loss": 0.3835, |
| "step": 1820 |
| }, |
| { |
| "epoch": 2.986906710310966, |
| "grad_norm": 0.53125, |
| "learning_rate": 2.5004636579511843e-06, |
| "loss": 0.4001, |
| "step": 1825 |
| }, |
| { |
| "epoch": 2.9950900163666123, |
| "grad_norm": 0.53125, |
| "learning_rate": 2.5000915882703615e-06, |
| "loss": 0.3819, |
| "step": 1830 |
| }, |
| { |
| "epoch": 2.9950900163666123, |
| "eval_loss": 0.3913484513759613, |
| "eval_runtime": 5.0457, |
| "eval_samples_per_second": 16.45, |
| "eval_steps_per_second": 16.45, |
| "step": 1830 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1833, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.0422166028387615e+18, |
| "train_batch_size": 48, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|