{
  "best_global_step": 1408,
  "best_metric": 0.3940983712673187,
  "best_model_checkpoint": "/home/ubuntu/mnt/dattafs/train/llama_openr1_sft/checkpoint-1408",
  "epoch": 1.9995726860952057,
  "eval_steps": 32,
  "global_step": 1462,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.021878471925476456,
      "grad_norm": 1.7315069437026978,
      "learning_rate": 5.4421768707483e-06,
      "loss": 0.7967,
      "num_tokens": 11790317.0,
      "step": 16
    },
    {
      "epoch": 0.04375694385095291,
      "grad_norm": 0.6117880344390869,
      "learning_rate": 1.08843537414966e-05,
      "loss": 0.6135,
      "step": 32
    },
    {
      "epoch": 0.04375694385095291,
      "eval_loss": 0.5614376664161682,
      "eval_num_tokens": 23652572.0,
      "eval_runtime": 18.6204,
      "eval_samples_per_second": 6.874,
      "eval_steps_per_second": 1.719,
      "step": 32
    },
    {
      "epoch": 0.06563541577642937,
      "grad_norm": 0.5508595108985901,
      "learning_rate": 1.6326530612244897e-05,
      "loss": 0.5603,
      "num_tokens": 35271932.0,
      "step": 48
    },
    {
      "epoch": 0.08751388770190582,
      "grad_norm": 0.980832576751709,
      "learning_rate": 2.17687074829932e-05,
      "loss": 0.5404,
      "step": 64
    },
    {
      "epoch": 0.08751388770190582,
      "eval_loss": 0.5128401517868042,
      "eval_num_tokens": 46694102.0,
      "eval_runtime": 43.903,
      "eval_samples_per_second": 2.916,
      "eval_steps_per_second": 0.729,
      "step": 64
    },
    {
      "epoch": 0.10939235962738228,
      "grad_norm": 0.918790876865387,
      "learning_rate": 2.72108843537415e-05,
      "loss": 0.5293,
      "num_tokens": 58386969.0,
      "step": 80
    },
    {
      "epoch": 0.13127083155285874,
      "grad_norm": 1.004357933998108,
      "learning_rate": 3.265306122448979e-05,
      "loss": 0.5192,
      "step": 96
    },
    {
      "epoch": 0.13127083155285874,
      "eval_loss": 0.5079865455627441,
      "eval_num_tokens": 69750037.0,
      "eval_runtime": 18.3625,
      "eval_samples_per_second": 6.971,
      "eval_steps_per_second": 1.743,
      "step": 96
    },
    {
      "epoch": 0.1531493034783352,
      "grad_norm": 1.4387233257293701,
      "learning_rate": 3.809523809523809e-05,
      "loss": 0.5224,
      "num_tokens": 81535265.0,
      "step": 112
    },
    {
      "epoch": 0.17502777540381165,
      "grad_norm": 1.11899733543396,
      "learning_rate": 4.35374149659864e-05,
      "loss": 0.5234,
      "step": 128
    },
    {
      "epoch": 0.17502777540381165,
      "eval_loss": 0.5003859400749207,
      "eval_num_tokens": 92959530.0,
      "eval_runtime": 44.753,
      "eval_samples_per_second": 2.86,
      "eval_steps_per_second": 0.715,
      "step": 128
    },
    {
      "epoch": 0.1969062473292881,
      "grad_norm": 1.0003859996795654,
      "learning_rate": 4.89795918367347e-05,
      "loss": 0.5166,
      "num_tokens": 104537897.0,
      "step": 144
    },
    {
      "epoch": 0.21878471925476456,
      "grad_norm": 0.7199622392654419,
      "learning_rate": 4.99879438109886e-05,
      "loss": 0.5217,
      "step": 160
    },
    {
      "epoch": 0.21878471925476456,
      "eval_loss": 0.49373239278793335,
      "eval_num_tokens": 116348744.0,
      "eval_runtime": 18.4241,
      "eval_samples_per_second": 6.947,
      "eval_steps_per_second": 1.737,
      "step": 160
    },
    {
      "epoch": 0.24066319118024102,
      "grad_norm": 0.7497566938400269,
      "learning_rate": 4.9940023582279216e-05,
      "loss": 0.5074,
      "num_tokens": 127932733.0,
      "step": 176
    },
    {
      "epoch": 0.26254166310571747,
      "grad_norm": 0.5236051082611084,
      "learning_rate": 4.985566722849454e-05,
      "loss": 0.504,
      "step": 192
    },
    {
      "epoch": 0.26254166310571747,
      "eval_loss": 0.4837675094604492,
      "eval_num_tokens": 139301330.0,
      "eval_runtime": 18.3533,
      "eval_samples_per_second": 6.974,
      "eval_steps_per_second": 1.744,
      "step": 192
    },
    {
      "epoch": 0.2844201350311939,
      "grad_norm": 0.5282280445098877,
      "learning_rate": 4.973499799004161e-05,
      "loss": 0.501,
      "num_tokens": 151017333.0,
      "step": 208
    },
    {
      "epoch": 0.3062986069566704,
      "grad_norm": 0.7268691062927246,
      "learning_rate": 4.957819215863282e-05,
      "loss": 0.4915,
      "step": 224
    },
    {
      "epoch": 0.3062986069566704,
      "eval_loss": 0.476335346698761,
      "eval_num_tokens": 162752028.0,
      "eval_runtime": 18.509,
      "eval_samples_per_second": 6.916,
      "eval_steps_per_second": 1.729,
      "step": 224
    },
    {
      "epoch": 0.3281770788821468,
      "grad_norm": 0.4880088269710541,
      "learning_rate": 4.9385478819732645e-05,
      "loss": 0.493,
      "num_tokens": 174636003.0,
      "step": 240
    },
    {
      "epoch": 0.3500555508076233,
      "grad_norm": 0.8152353763580322,
      "learning_rate": 4.9157139517875176e-05,
      "loss": 0.4993,
      "step": 256
    },
    {
      "epoch": 0.3500555508076233,
      "eval_loss": 0.47467267513275146,
      "eval_num_tokens": 186441099.0,
      "eval_runtime": 18.5278,
      "eval_samples_per_second": 6.909,
      "eval_steps_per_second": 1.727,
      "step": 256
    },
    {
      "epoch": 0.3719340227330997,
      "grad_norm": 0.5629775524139404,
      "learning_rate": 4.889350784534168e-05,
      "loss": 0.4943,
      "num_tokens": 198380851.0,
      "step": 272
    },
    {
      "epoch": 0.3938124946585762,
      "grad_norm": 0.5178916454315186,
      "learning_rate": 4.859496895479903e-05,
      "loss": 0.4932,
      "step": 288
    },
    {
      "epoch": 0.3938124946585762,
      "eval_loss": 0.4674685001373291,
      "eval_num_tokens": 210236651.0,
      "eval_runtime": 46.2968,
      "eval_samples_per_second": 2.765,
      "eval_steps_per_second": 0.691,
      "step": 288
    },
    {
      "epoch": 0.41569096658405263,
      "grad_norm": 0.41224056482315063,
      "learning_rate": 4.8261958996610914e-05,
      "loss": 0.4811,
      "num_tokens": 221999996.0,
      "step": 304
    },
    {
      "epoch": 0.4375694385095291,
      "grad_norm": 0.5033276081085205,
      "learning_rate": 4.7894964481643984e-05,
      "loss": 0.4833,
      "step": 320
    },
    {
      "epoch": 0.4375694385095291,
      "eval_loss": 0.4603790044784546,
      "eval_num_tokens": 233668486.0,
      "eval_runtime": 18.4345,
      "eval_samples_per_second": 6.943,
      "eval_steps_per_second": 1.736,
      "step": 320
    },
    {
      "epoch": 0.45944791043500555,
      "grad_norm": 0.4538171887397766,
      "learning_rate": 4.7494521570499914e-05,
      "loss": 0.4769,
      "num_tokens": 245229299.0,
      "step": 336
    },
    {
      "epoch": 0.48132638236048203,
      "grad_norm": 0.44999656081199646,
      "learning_rate": 4.706121529021158e-05,
      "loss": 0.476,
      "step": 352
    },
    {
      "epoch": 0.48132638236048203,
      "eval_loss": 0.45518478751182556,
      "eval_num_tokens": 256808967.0,
      "eval_runtime": 44.9829,
      "eval_samples_per_second": 2.846,
      "eval_steps_per_second": 0.711,
      "step": 352
    },
    {
      "epoch": 0.5032048542859585,
      "grad_norm": 0.4638039171695709,
      "learning_rate": 4.659567867954784e-05,
      "loss": 0.4792,
      "num_tokens": 268354735.0,
      "step": 368
    },
    {
      "epoch": 0.5250833262114349,
      "grad_norm": 0.43113598227500916,
      "learning_rate": 4.6098591864175696e-05,
      "loss": 0.4666,
      "step": 384
    },
    {
      "epoch": 0.5250833262114349,
      "eval_loss": 0.45199862122535706,
      "eval_num_tokens": 279922749.0,
      "eval_runtime": 18.4358,
      "eval_samples_per_second": 6.943,
      "eval_steps_per_second": 1.736,
      "step": 384
    },
    {
      "epoch": 0.5469617981369114,
      "grad_norm": 0.44990119338035583,
      "learning_rate": 4.557068106303067e-05,
      "loss": 0.467,
      "num_tokens": 291496113.0,
      "step": 400
    },
    {
      "epoch": 0.5688402700623878,
      "grad_norm": 0.4064292907714844,
      "learning_rate": 4.501271752734737e-05,
      "loss": 0.4635,
      "step": 416
    },
    {
      "epoch": 0.5688402700623878,
      "eval_loss": 0.4471489489078522,
      "eval_num_tokens": 303134880.0,
      "eval_runtime": 44.7579,
      "eval_samples_per_second": 2.86,
      "eval_steps_per_second": 0.715,
      "step": 416
    },
    {
      "epoch": 0.5907187419878643,
      "grad_norm": 0.4212772846221924,
      "learning_rate": 4.442551641390008e-05,
      "loss": 0.46,
      "num_tokens": 314857074.0,
      "step": 432
    },
    {
      "epoch": 0.6125972139133408,
      "grad_norm": 0.43323245644569397,
      "learning_rate": 4.3809935594099515e-05,
      "loss": 0.4614,
      "step": 448
    },
    {
      "epoch": 0.6125972139133408,
      "eval_loss": 0.442826509475708,
      "eval_num_tokens": 326457951.0,
      "eval_runtime": 18.6512,
      "eval_samples_per_second": 6.863,
      "eval_steps_per_second": 1.716,
      "step": 448
    },
    {
      "epoch": 0.6344756858388172,
      "grad_norm": 0.33663076162338257,
      "learning_rate": 4.3166874400685694e-05,
      "loss": 0.4593,
      "num_tokens": 337990615.0,
      "step": 464
    },
    {
      "epoch": 0.6563541577642936,
      "grad_norm": 0.537545919418335,
      "learning_rate": 4.2497272313847825e-05,
      "loss": 0.4582,
      "step": 480
    },
    {
      "epoch": 0.6563541577642936,
      "eval_loss": 0.43967390060424805,
      "eval_num_tokens": 349803965.0,
      "eval_runtime": 18.4027,
      "eval_samples_per_second": 6.956,
      "eval_steps_per_second": 1.739,
      "step": 480
    },
    {
      "epoch": 0.67823262968977,
      "grad_norm": 0.4213317334651947,
      "learning_rate": 4.1802107588690856e-05,
      "loss": 0.4565,
      "num_tokens": 361540653.0,
      "step": 496
    },
    {
      "epoch": 0.7001111016152466,
      "grad_norm": 0.3804231882095337,
      "learning_rate": 4.108239582605374e-05,
      "loss": 0.4545,
      "step": 512
    },
    {
      "epoch": 0.7001111016152466,
      "eval_loss": 0.4352826476097107,
      "eval_num_tokens": 373230587.0,
      "eval_runtime": 18.5333,
      "eval_samples_per_second": 6.906,
      "eval_steps_per_second": 1.727,
      "step": 512
    },
    {
      "epoch": 0.721989573540723,
      "grad_norm": 0.40216949582099915,
      "learning_rate": 4.033918848876751e-05,
      "loss": 0.4513,
      "num_tokens": 384593207.0,
      "step": 528
    },
    {
      "epoch": 0.7438680454661994,
      "grad_norm": 0.39182427525520325,
      "learning_rate": 3.957357136552072e-05,
      "loss": 0.4457,
      "step": 544
    },
    {
      "epoch": 0.7438680454661994,
      "eval_loss": 0.43387627601623535,
      "eval_num_tokens": 395921551.0,
      "eval_runtime": 18.4858,
      "eval_samples_per_second": 6.924,
      "eval_steps_per_second": 1.731,
      "step": 544
    },
    {
      "epoch": 0.7657465173916759,
      "grad_norm": 0.4319497346878052,
      "learning_rate": 3.8786662984576605e-05,
      "loss": 0.449,
      "num_tokens": 407202540.0,
      "step": 560
    },
    {
      "epoch": 0.7876249893171524,
      "grad_norm": 0.41336458921432495,
      "learning_rate": 3.79796129796593e-05,
      "loss": 0.4501,
      "step": 576
    },
    {
      "epoch": 0.7876249893171524,
      "eval_loss": 0.4308420419692993,
      "eval_num_tokens": 418778095.0,
      "eval_runtime": 45.2458,
      "eval_samples_per_second": 2.829,
      "eval_steps_per_second": 0.707,
      "step": 576
    },
    {
      "epoch": 0.8095034612426288,
      "grad_norm": 0.43935731053352356,
      "learning_rate": 3.715360041039655e-05,
      "loss": 0.4463,
      "num_tokens": 430448376.0,
      "step": 592
    },
    {
      "epoch": 0.8313819331681053,
      "grad_norm": 0.3827608525753021,
      "learning_rate": 3.6309832039772707e-05,
      "loss": 0.4445,
      "step": 608
    },
    {
      "epoch": 0.8313819331681053,
      "eval_loss": 0.4279908537864685,
      "eval_num_tokens": 441738429.0,
      "eval_runtime": 18.3837,
      "eval_samples_per_second": 6.963,
      "eval_steps_per_second": 1.741,
      "step": 608
    },
    {
      "epoch": 0.8532604050935817,
      "grad_norm": 0.3697313368320465,
      "learning_rate": 3.544954057110839e-05,
      "loss": 0.4484,
      "num_tokens": 453881118.0,
      "step": 624
    },
    {
      "epoch": 0.8751388770190582,
      "grad_norm": 0.37059286236763,
      "learning_rate": 3.457398284714275e-05,
      "loss": 0.4403,
      "step": 640
    },
    {
      "epoch": 0.8751388770190582,
      "eval_loss": 0.42479920387268066,
      "eval_num_tokens": 465534133.0,
      "eval_runtime": 45.8322,
      "eval_samples_per_second": 2.793,
      "eval_steps_per_second": 0.698,
      "step": 640
    },
    {
      "epoch": 0.8970173489445347,
      "grad_norm": 0.3813212513923645,
      "learning_rate": 3.3684438013849154e-05,
      "loss": 0.4401,
      "num_tokens": 476853966.0,
      "step": 656
    },
    {
      "epoch": 0.9188958208700111,
      "grad_norm": 0.32739850878715515,
      "learning_rate": 3.2782205651667013e-05,
      "loss": 0.4392,
      "step": 672
    },
    {
      "epoch": 0.9188958208700111,
      "eval_loss": 0.4213526248931885,
      "eval_num_tokens": 488461810.0,
      "eval_runtime": 18.2948,
      "eval_samples_per_second": 6.997,
      "eval_steps_per_second": 1.749,
      "step": 672
    },
    {
      "epoch": 0.9407742927954875,
      "grad_norm": 0.34722763299942017,
      "learning_rate": 3.186860387687986e-05,
      "loss": 0.4433,
      "num_tokens": 500387798.0,
      "step": 688
    },
    {
      "epoch": 0.9626527647209641,
      "grad_norm": 0.3709475100040436,
      "learning_rate": 3.094496741591349e-05,
      "loss": 0.4418,
      "step": 704
    },
    {
      "epoch": 0.9626527647209641,
      "eval_loss": 0.41918742656707764,
      "eval_num_tokens": 512072235.0,
      "eval_runtime": 18.1717,
      "eval_samples_per_second": 7.044,
      "eval_steps_per_second": 1.761,
      "step": 704
    },
    {
      "epoch": 0.9845312366464405,
      "grad_norm": 0.3451824188232422,
      "learning_rate": 3.00126456553675e-05,
      "loss": 0.4338,
      "num_tokens": 523848298.0,
      "step": 720
    },
    {
      "epoch": 1.0068370224767114,
      "grad_norm": 0.517411470413208,
      "learning_rate": 2.9073000670629098e-05,
      "loss": 0.4445,
      "step": 736
    },
    {
      "epoch": 1.0068370224767114,
      "eval_loss": 0.4266531467437744,
      "eval_num_tokens": 535523389.0,
      "eval_runtime": 18.4611,
      "eval_samples_per_second": 6.934,
      "eval_steps_per_second": 1.733,
      "step": 736
    },
    {
      "epoch": 1.028715494402188,
      "grad_norm": 0.3714677691459656,
      "learning_rate": 2.8127405235949174e-05,
      "loss": 0.3777,
      "num_tokens": 547246764.0,
      "step": 752
    },
    {
      "epoch": 1.0505939663276642,
      "grad_norm": 0.3552056550979614,
      "learning_rate": 2.7177240818887893e-05,
      "loss": 0.3742,
      "step": 768
    },
    {
      "epoch": 1.0505939663276642,
      "eval_loss": 0.421397864818573,
      "eval_num_tokens": 558829084.0,
      "eval_runtime": 18.4181,
      "eval_samples_per_second": 6.95,
      "eval_steps_per_second": 1.737,
      "step": 768
    },
    {
      "epoch": 1.0724724382531408,
      "grad_norm": 0.3726598620414734,
      "learning_rate": 2.6223895562059786e-05,
      "loss": 0.3683,
      "num_tokens": 570404128.0,
      "step": 784
    },
    {
      "epoch": 1.0943509101786173,
      "grad_norm": 0.33536332845687866,
      "learning_rate": 2.5268762255126948e-05,
      "loss": 0.3658,
      "step": 800
    },
    {
      "epoch": 1.0943509101786173,
      "eval_loss": 0.41810134053230286,
      "eval_num_tokens": 582073105.0,
      "eval_runtime": 43.7846,
      "eval_samples_per_second": 2.923,
      "eval_steps_per_second": 0.731,
      "step": 800
    },
    {
      "epoch": 1.1162293821040936,
      "grad_norm": 0.32326406240463257,
      "learning_rate": 2.4313236300003103e-05,
      "loss": 0.3683,
      "num_tokens": 593969991.0,
      "step": 816
    },
    {
      "epoch": 1.1381078540295702,
      "grad_norm": 0.3347606956958771,
      "learning_rate": 2.33587136722413e-05,
      "loss": 0.3697,
      "step": 832
    },
    {
      "epoch": 1.1381078540295702,
      "eval_loss": 0.41542908549308777,
      "eval_num_tokens": 605516295.0,
      "eval_runtime": 18.4764,
      "eval_samples_per_second": 6.928,
      "eval_steps_per_second": 1.732,
      "step": 832
    },
    {
      "epoch": 1.1599863259550465,
      "grad_norm": 0.34967222809791565,
      "learning_rate": 2.2406588881583594e-05,
      "loss": 0.3639,
      "num_tokens": 617129829.0,
      "step": 848
    },
    {
      "epoch": 1.181864797880523,
      "grad_norm": 0.3748638927936554,
      "learning_rate": 2.1458252934652146e-05,
      "loss": 0.366,
      "step": 864
    },
    {
      "epoch": 1.181864797880523,
      "eval_loss": 0.41449588537216187,
      "eval_num_tokens": 628546914.0,
      "eval_runtime": 45.7716,
      "eval_samples_per_second": 2.796,
      "eval_steps_per_second": 0.699,
      "step": 864
    },
    {
      "epoch": 1.2037432698059995,
      "grad_norm": 0.33047473430633545,
      "learning_rate": 2.0515091302758217e-05,
      "loss": 0.3634,
      "num_tokens": 640328734.0,
      "step": 880
    },
    {
      "epoch": 1.2256217417314759,
      "grad_norm": 0.34247222542762756,
      "learning_rate": 1.9578481897798028e-05,
      "loss": 0.3618,
      "step": 896
    },
    {
      "epoch": 1.2256217417314759,
      "eval_loss": 0.4123848080635071,
      "eval_num_tokens": 651969596.0,
      "eval_runtime": 18.5044,
      "eval_samples_per_second": 6.917,
      "eval_steps_per_second": 1.729,
      "step": 896
    },
    {
      "epoch": 1.2475002136569524,
      "grad_norm": 0.3337569534778595,
      "learning_rate": 1.864979305919248e-05,
      "loss": 0.3651,
      "num_tokens": 663693234.0,
      "step": 912
    },
    {
      "epoch": 1.2693786855824287,
      "grad_norm": 0.3318181335926056,
      "learning_rate": 1.7730381554811815e-05,
      "loss": 0.3625,
      "step": 928
    },
    {
      "epoch": 1.2693786855824287,
      "eval_loss": 0.4105568528175354,
      "eval_num_tokens": 675331630.0,
      "eval_runtime": 44.7946,
      "eval_samples_per_second": 2.857,
      "eval_steps_per_second": 0.714,
      "step": 928
    },
    {
      "epoch": 1.2912571575079053,
      "grad_norm": 0.3317345678806305,
      "learning_rate": 1.6821590598805708e-05,
      "loss": 0.3615,
      "num_tokens": 686943119.0,
      "step": 944
    },
    {
      "epoch": 1.3131356294333818,
      "grad_norm": 0.3148461580276489,
      "learning_rate": 1.5924747889234743e-05,
      "loss": 0.3602,
      "step": 960
    },
    {
      "epoch": 1.3131356294333818,
      "eval_loss": 0.40809541940689087,
      "eval_num_tokens": 698738800.0,
      "eval_runtime": 18.4793,
      "eval_samples_per_second": 6.927,
      "eval_steps_per_second": 1.732,
      "step": 960
    },
    {
      "epoch": 1.3350141013588583,
      "grad_norm": 0.3473955988883972,
      "learning_rate": 1.5041163668369939e-05,
      "loss": 0.3595,
      "num_tokens": 710486757.0,
      "step": 976
    },
    {
      "epoch": 1.3568925732843347,
      "grad_norm": 0.2996233105659485,
      "learning_rate": 1.4172128808494572e-05,
      "loss": 0.358,
      "step": 992
    },
    {
      "epoch": 1.3568925732843347,
      "eval_loss": 0.406377911567688,
      "eval_num_tokens": 722294728.0,
      "eval_runtime": 18.2468,
      "eval_samples_per_second": 7.015,
      "eval_steps_per_second": 1.754,
      "step": 992
    },
    {
      "epoch": 1.3787710452098112,
      "grad_norm": 0.31939494609832764,
      "learning_rate": 1.3318912926004351e-05,
      "loss": 0.3543,
      "num_tokens": 733756855.0,
      "step": 1008
    },
    {
      "epoch": 1.4006495171352875,
      "grad_norm": 0.2936459481716156,
      "learning_rate": 1.2482762526561448e-05,
      "loss": 0.3626,
      "step": 1024
    },
    {
      "epoch": 1.4006495171352875,
      "eval_loss": 0.40473318099975586,
      "eval_num_tokens": 745739855.0,
      "eval_runtime": 18.318,
      "eval_samples_per_second": 6.988,
      "eval_steps_per_second": 1.747,
      "step": 1024
    },
    {
      "epoch": 1.422527989060764,
      "grad_norm": 0.29807737469673157,
      "learning_rate": 1.1664899184012229e-05,
      "loss": 0.3576,
      "num_tokens": 757373615.0,
      "step": 1040
    },
    {
      "epoch": 1.4444064609862406,
      "grad_norm": 0.31612807512283325,
      "learning_rate": 1.0866517755729063e-05,
      "loss": 0.3584,
      "step": 1056
    },
    {
      "epoch": 1.4444064609862406,
      "eval_loss": 0.40248239040374756,
      "eval_num_tokens": 769145930.0,
      "eval_runtime": 18.5352,
      "eval_samples_per_second": 6.906,
      "eval_steps_per_second": 1.726,
      "step": 1056
    },
    {
      "epoch": 1.466284932911717,
      "grad_norm": 0.2999597191810608,
      "learning_rate": 1.0088784636983473e-05,
      "loss": 0.3546,
      "num_tokens": 780592475.0,
      "step": 1072
    },
    {
      "epoch": 1.4881634048371934,
      "grad_norm": 0.30209967494010925,
      "learning_rate": 9.332836056901176e-06,
      "loss": 0.3531,
      "step": 1088
    },
    {
      "epoch": 1.4881634048371934,
      "eval_loss": 0.4016121029853821,
      "eval_num_tokens": 792419009.0,
      "eval_runtime": 45.144,
      "eval_samples_per_second": 2.835,
      "eval_steps_per_second": 0.709,
      "step": 1088
    },
    {
      "epoch": 1.5100418767626698,
      "grad_norm": 0.28912022709846497,
      "learning_rate": 8.599776418488159e-06,
      "loss": 0.3476,
      "num_tokens": 804175102.0,
      "step": 1104
    },
    {
      "epoch": 1.5319203486881463,
      "grad_norm": 0.28878605365753174,
      "learning_rate": 7.890676685153314e-06,
      "loss": 0.351,
      "step": 1120
    },
    {
      "epoch": 1.5319203486881463,
      "eval_loss": 0.3999693691730499,
      "eval_num_tokens": 815878094.0,
      "eval_runtime": 18.5504,
      "eval_samples_per_second": 6.9,
      "eval_steps_per_second": 1.725,
      "step": 1120
    },
    {
      "epoch": 1.5537988206136228,
      "grad_norm": 0.29332441091537476,
      "learning_rate": 7.206572816084464e-06,
      "loss": 0.3525,
      "num_tokens": 827490302.0,
      "step": 1136
    },
    {
      "epoch": 1.5756772925390994,
      "grad_norm": 0.2913924753665924,
      "learning_rate": 6.5484642527639055e-06,
      "loss": 0.3529,
      "step": 1152
    },
    {
      "epoch": 1.5756772925390994,
      "eval_loss": 0.39920222759246826,
      "eval_num_tokens": 839135863.0,
      "eval_runtime": 45.1069,
      "eval_samples_per_second": 2.838,
      "eval_steps_per_second": 0.709,
      "step": 1152
    },
    {
      "epoch": 1.5975557644645757,
      "grad_norm": 0.2814177870750427,
      "learning_rate": 5.917312458834495e-06,
      "loss": 0.353,
      "num_tokens": 850717611.0,
      "step": 1168
    },
    {
      "epoch": 1.619434236390052,
      "grad_norm": 0.2677833139896393,
      "learning_rate": 5.314039515449418e-06,
      "loss": 0.3538,
      "step": 1184
    },
    {
      "epoch": 1.619434236390052,
      "eval_loss": 0.397605836391449,
      "eval_num_tokens": 862683824.0,
      "eval_runtime": 18.3743,
      "eval_samples_per_second": 6.966,
      "eval_steps_per_second": 1.742,
      "step": 1184
    },
    {
      "epoch": 1.6413127083155286,
      "grad_norm": 0.27799126505851746,
      "learning_rate": 4.739526774157807e-06,
      "loss": 0.3505,
      "num_tokens": 874236055.0,
      "step": 1200
    },
    {
      "epoch": 1.663191180241005,
      "grad_norm": 0.2723589241504669,
      "learning_rate": 4.19461356929429e-06,
      "loss": 0.3524,
      "step": 1216
    },
    {
      "epoch": 1.663191180241005,
      "eval_loss": 0.3971025049686432,
      "eval_num_tokens": 885655282.0,
      "eval_runtime": 18.1636,
      "eval_samples_per_second": 7.047,
      "eval_steps_per_second": 1.762,
      "step": 1216
    },
    {
      "epoch": 1.6850696521664816,
      "grad_norm": 0.27828726172447205,
      "learning_rate": 3.6800959917535765e-06,
      "loss": 0.349,
      "num_tokens": 897507679.0,
      "step": 1232
    },
    {
      "epoch": 1.706948124091958,
      "grad_norm": 0.2791634798049927,
      "learning_rate": 3.1967257259415185e-06,
      "loss": 0.3458,
      "step": 1248
    },
    {
      "epoch": 1.706948124091958,
      "eval_loss": 0.3961884379386902,
      "eval_num_tokens": 908891789.0,
      "eval_runtime": 18.2964,
      "eval_samples_per_second": 6.996,
      "eval_steps_per_second": 1.749,
      "step": 1248
    },
    {
      "epoch": 1.7288265960174343,
      "grad_norm": 0.27799129486083984,
      "learning_rate": 2.7452089516018935e-06,
      "loss": 0.3519,
      "num_tokens": 920647977.0,
      "step": 1264
    },
    {
      "epoch": 1.7507050679429108,
      "grad_norm": 0.2728063762187958,
      "learning_rate": 2.326205312123136e-06,
      "loss": 0.3437,
      "step": 1280
    },
    {
      "epoch": 1.7507050679429108,
      "eval_loss": 0.39544403553009033,
      "eval_num_tokens": 931912973.0,
      "eval_runtime": 18.3293,
      "eval_samples_per_second": 6.983,
      "eval_steps_per_second": 1.746,
      "step": 1280
    },
    {
      "epoch": 1.7725835398683873,
      "grad_norm": 0.27384352684020996,
      "learning_rate": 1.940326950832391e-06,
      "loss": 0.3451,
      "num_tokens": 943388074.0,
      "step": 1296
    },
    {
      "epoch": 1.7944620117938639,
      "grad_norm": 0.25831547379493713,
      "learning_rate": 1.5881376166848149e-06,
      "loss": 0.3496,
      "step": 1312
    },
    {
      "epoch": 1.7944620117938639,
      "eval_loss": 0.3953617215156555,
      "eval_num_tokens": 955111326.0,
      "eval_runtime": 44.1506,
      "eval_samples_per_second": 2.899,
      "eval_steps_per_second": 0.725,
      "step": 1312
    },
    {
      "epoch": 1.8163404837193402,
      "grad_norm": 0.2703952193260193,
      "learning_rate": 1.2701518406545571e-06,
      "loss": 0.3486,
      "num_tokens": 966913070.0,
      "step": 1328
    },
    {
      "epoch": 1.8382189556448165,
      "grad_norm": 0.26095762848854065,
      "learning_rate": 9.868341840307993e-07,
      "loss": 0.3481,
      "step": 1344
    },
    {
      "epoch": 1.8382189556448165,
      "eval_loss": 0.3945625424385071,
      "eval_num_tokens": 978491221.0,
      "eval_runtime": 18.31,
      "eval_samples_per_second": 6.991,
      "eval_steps_per_second": 1.748,
      "step": 1344
    },
    {
      "epoch": 1.860097427570293,
      "grad_norm": 0.2432006448507309,
      "learning_rate": 7.385985597169798e-07,
      "loss": 0.351,
      "num_tokens": 990169429.0,
      "step": 1360
    },
    {
      "epoch": 1.8819758994957696,
      "grad_norm": 0.27600374817848206,
      "learning_rate": 5.258076275247825e-07,
      "loss": 0.3432,
      "step": 1376
    },
    {
      "epoch": 1.8819758994957696,
      "eval_loss": 0.3942786455154419,
      "eval_num_tokens": 1001694852.0,
      "eval_runtime": 45.1966,
      "eval_samples_per_second": 2.832,
      "eval_steps_per_second": 0.708,
      "step": 1376
    },
    {
      "epoch": 1.9038543714212461,
      "grad_norm": 0.2700289487838745,
      "learning_rate": 3.4877226434630315e-07,
      "loss": 0.3444,
      "num_tokens": 1013251008.0,
      "step": 1392
    },
    {
      "epoch": 1.9257328433467225,
      "grad_norm": 0.250434011220932,
      "learning_rate": 2.0775110997850733e-07,
      "loss": 0.345,
      "step": 1408
    },
    {
      "epoch": 1.9257328433467225,
      "eval_loss": 0.3940983712673187,
      "eval_num_tokens": 1025080577.0,
      "eval_runtime": 18.3362,
      "eval_samples_per_second": 6.981,
      "eval_steps_per_second": 1.745,
      "step": 1408
    },
    {
      "epoch": 1.947611315272199,
      "grad_norm": 0.2536456882953644,
      "learning_rate": 1.0295018926342881e-07,
      "loss": 0.3464,
      "num_tokens": 1036518998.0,
      "step": 1424
    },
    {
      "epoch": 1.9694897871976753,
      "grad_norm": 0.2704523205757141,
      "learning_rate": 3.4522611096193815e-08,
      "loss": 0.3504,
      "step": 1440
    },
    {
      "epoch": 1.9694897871976753,
      "eval_loss": 0.39410996437072754,
      "eval_num_tokens": 1048000170.0,
      "eval_runtime": 45.2694,
      "eval_samples_per_second": 2.828,
      "eval_steps_per_second": 0.707,
      "step": 1440
    },
    {
      "epoch": 1.9913682591231519,
      "grad_norm": 0.2705751061439514,
      "learning_rate": 2.568344740602746e-09,
      "loss": 0.3473,
      "num_tokens": 1059695234.0,
      "step": 1456
    },
    {
      "epoch": 1.9995726860952057,
      "num_tokens": 1063961915.0,
      "step": 1462,
      "total_flos": 1.1979767668153516e+19,
      "train_loss": 0.4216477999771995,
      "train_runtime": 170373.8054,
      "train_samples_per_second": 1.099,
      "train_steps_per_second": 0.009
    }
  ],
  "logging_steps": 16,
  "max_steps": 1462,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 32,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1979767668153516e+19,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}