{ "best_metric": null, "best_model_checkpoint": null, "epoch": 399.8, "eval_steps": 10, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 606.59375, "epoch": 0.8, "grad_norm": 1.3200632494048739, "kl": 0.0, "learning_rate": 5e-08, "loss": 0.043, "reward": 11.62500011920929, "reward_std": 5.327881373465061, "rewards/accuracy_reward_staging": 0.9671875108033419, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.984375, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 622.75, "epoch": 1.8, "grad_norm": 1.245967192961327, "kl": 0.0, "learning_rate": 1e-07, "loss": 0.0103, "reward": 11.717187702655792, "reward_std": 5.550888277590275, "rewards/accuracy_reward_staging": 0.973281254991889, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 1.0, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 562.15625, "epoch": 2.8, "grad_norm": 1.3598583264581219, "kl": 0.0012388229370117188, "learning_rate": 1.5e-07, "loss": 0.0066, "reward": 11.326562702655792, "reward_std": 4.338181830942631, "rewards/accuracy_reward_staging": 0.935781279578805, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.984375, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 633.84375, "epoch": 3.8, "grad_norm": 1.2847021449800877, "kl": 0.001153707504272461, "learning_rate": 2e-07, "loss": 0.0018, "reward": 11.992187529802322, "reward_std": 5.119553402066231, "rewards/accuracy_reward_staging": 1.0039062658324838, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.984375, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 617.796875, "epoch": 4.8, "grad_norm": 1.3189665452513488, "kl": 0.0013086795806884766, "learning_rate": 2.5e-07, "loss": 0.085, "reward": 10.701562702655792, "reward_std": 5.677764259278774, "rewards/accuracy_reward_staging": 0.8795312605798244, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 610.96875, "epoch": 5.8, "grad_norm": 1.2985032171428867, "kl": 0.001140594482421875, "learning_rate": 3e-07, "loss": -0.0194, "reward": 11.771875321865082, "reward_std": 4.783194027841091, "rewards/accuracy_reward_staging": 0.9771875143051147, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 601.046875, "epoch": 6.8, "grad_norm": 1.3341628850896736, "kl": 0.0011951923370361328, "learning_rate": 3.5e-07, "loss": -0.0596, "reward": 10.256250113248825, "reward_std": 4.867078542709351, "rewards/accuracy_reward_staging": 0.827187517657876, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 1.0, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 606.59375, "epoch": 7.8, "grad_norm": 1.285368092332121, "kl": 0.0010988712310791016, "learning_rate": 4e-07, "loss": -0.0081, "reward": 9.478125274181366, "reward_std": 3.7967969875317067, "rewards/accuracy_reward_staging": 0.7493750108405948, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 0.984375, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 622.796875, "epoch": 8.8, "grad_norm": 1.2619218024838643, "kl": 0.0011816024780273438, "learning_rate": 4.5e-07, "loss": 0.022, "reward": 9.204687714576721, "reward_std": 4.6997692584991455, "rewards/accuracy_reward_staging": 0.7235937705263495, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 9 }, { "epoch": 9.8, "grad_norm": 1.3090135878224105, "learning_rate": 5e-07, "loss": 0.0552, "step": 10 }, { "epoch": 9.8, "eval_clip_ratio": 0.0, "eval_completion_length": 598.6, "eval_kl": 0.001386260986328125, "eval_loss": -0.007845744490623474, "eval_reward": 10.647500252723693, "eval_reward_std": 5.141342180967331, "eval_rewards/accuracy_reward_staging": 0.8647500067949295, "eval_rewards/format_reward": 1.0, "eval_rewards/format_reward_staging": 1.0, "eval_runtime": 128.7421, "eval_samples_per_second": 0.155, "eval_steps_per_second": 0.039, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 616.1640625, "epoch": 10.8, "grad_norm": 1.2259380562812106, "kl": 0.0011686086654663086, "learning_rate": 5.5e-07, "loss": 0.021, "reward": 10.721875354647636, "reward_std": 4.659499041736126, "rewards/accuracy_reward_staging": 0.8792187599465251, "rewards/format_reward": 0.9609375, "rewards/format_reward_staging": 0.96875, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 614.578125, "epoch": 11.8, "grad_norm": 1.1553077299419614, "kl": 0.0011510848999023438, "learning_rate": 6e-07, "loss": -0.025, "reward": 10.503125250339508, "reward_std": 5.1386475414037704, "rewards/accuracy_reward_staging": 0.8581250142306089, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 0.96875, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 612.90625, "epoch": 12.8, "grad_norm": 1.3576949395163465, "kl": 0.0011174678802490234, "learning_rate": 6.5e-07, "loss": -0.0006, "reward": 10.643750101327896, "reward_std": 4.954892493784428, "rewards/accuracy_reward_staging": 0.8737500086426735, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 0.953125, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 628.53125, "epoch": 13.8, "grad_norm": 1.2887380340653378, "kl": 0.0011186599731445312, "learning_rate": 7e-07, "loss": -0.0706, "reward": 8.992187768220901, "reward_std": 4.08132154494524, "rewards/accuracy_reward_staging": 0.7023437591269612, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 655.21875, "epoch": 14.8, "grad_norm": 1.3384872721535677, "kl": 0.0011413097381591797, "learning_rate": 7.5e-07, "loss": -0.0176, "reward": 10.270312935113907, "reward_std": 5.108375668525696, "rewards/accuracy_reward_staging": 0.837968748062849, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.953125, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 620.21875, "epoch": 15.8, "grad_norm": 1.3078665462402237, "kl": 0.0012836456298828125, "learning_rate": 8e-07, "loss": -0.0292, "reward": 10.915625095367432, "reward_std": 5.460881970822811, "rewards/accuracy_reward_staging": 0.8946875166147947, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 619.3125, "epoch": 16.8, "grad_norm": 1.3598193439821062, "kl": 0.0014238357543945312, "learning_rate": 8.499999999999999e-07, "loss": 0.008, "reward": 10.659375220537186, "reward_std": 5.2481329292058945, "rewards/accuracy_reward_staging": 0.8690625205636024, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 649.84375, "epoch": 17.8, "grad_norm": 1.3496406035053183, "kl": 0.0015878677368164062, "learning_rate": 9e-07, "loss": 0.0445, "reward": 10.225000023841858, "reward_std": 4.735325090587139, "rewards/accuracy_reward_staging": 0.8225000277161598, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 572.265625, "epoch": 18.8, "grad_norm": 1.3727510323236323, "kl": 0.0017957687377929688, "learning_rate": 9.499999999999999e-07, "loss": 0.0497, "reward": 9.406250178813934, "reward_std": 4.422982223331928, "rewards/accuracy_reward_staging": 0.7437500189989805, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.984375, "step": 19 }, { "epoch": 19.8, "grad_norm": 1.2508053733094389, "learning_rate": 1e-06, "loss": -0.0193, "step": 20 }, { "epoch": 19.8, "eval_clip_ratio": 0.0, "eval_completion_length": 644.85, "eval_kl": 0.0017383575439453125, "eval_loss": 0.013418617658317089, "eval_reward": 12.118750143051148, "eval_reward_std": 5.42993243932724, "eval_rewards/accuracy_reward_staging": 1.0181250065565108, "eval_rewards/format_reward": 0.9375, "eval_rewards/format_reward_staging": 1.0, "eval_runtime": 138.2951, "eval_samples_per_second": 0.145, "eval_steps_per_second": 0.036, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 618.1875, "epoch": 20.8, "grad_norm": 1.2769652330197667, "kl": 0.0017538070678710938, "learning_rate": 1.05e-06, "loss": 0.0169, "reward": 10.289843887090683, "reward_std": 4.1728136613965034, "rewards/accuracy_reward_staging": 0.8297656457871199, "rewards/format_reward": 0.9921875, "rewards/format_reward_staging": 1.0, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 586.828125, "epoch": 21.8, "grad_norm": 1.2550140515179407, "kl": 0.0023107528686523438, "learning_rate": 1.1e-06, "loss": 0.0203, "reward": 10.453125178813934, "reward_std": 5.255412273108959, "rewards/accuracy_reward_staging": 0.8484374992549419, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.984375, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 609.78125, "epoch": 22.8, "grad_norm": 1.1649225522797726, "kl": 0.0023360252380371094, "learning_rate": 1.1499999999999998e-06, "loss": 0.0301, "reward": 10.517187863588333, "reward_std": 4.380151428282261, "rewards/accuracy_reward_staging": 0.8532812558114529, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 1.0, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 640.1875, "epoch": 23.8, "grad_norm": 1.2148463960481974, "kl": 0.0026197433471679688, "learning_rate": 1.2e-06, "loss": 0.0016, "reward": 10.815625190734863, "reward_std": 4.644554391503334, "rewards/accuracy_reward_staging": 0.8893750132992864, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.984375, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 605.65625, "epoch": 24.8, "grad_norm": 1.2403401257649775, "kl": 0.0026373863220214844, "learning_rate": 1.2499999999999999e-06, "loss": 0.0521, "reward": 10.245312750339508, "reward_std": 4.28605642169714, "rewards/accuracy_reward_staging": 0.8323437552899122, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.984375, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 598.609375, "epoch": 25.8, "grad_norm": 1.3147177916400645, "kl": 0.0031595230102539062, "learning_rate": 1.3e-06, "loss": 0.0414, "reward": 10.621875256299973, "reward_std": 5.236618235707283, "rewards/accuracy_reward_staging": 0.8684375081211329, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 0.984375, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 625.40625, "epoch": 26.8, "grad_norm": 1.2154796498346485, "kl": 0.003124237060546875, "learning_rate": 1.35e-06, "loss": -0.0316, "reward": 11.971875131130219, "reward_std": 4.97715250402689, "rewards/accuracy_reward_staging": 1.0034375116229057, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 669.484375, "epoch": 27.8, "grad_norm": 1.2295951327912775, "kl": 0.004219532012939453, "learning_rate": 1.4e-06, "loss": 0.0328, "reward": 10.487500160932541, "reward_std": 4.095494709908962, "rewards/accuracy_reward_staging": 0.8581250123679638, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 642.875, "epoch": 28.8, "grad_norm": 1.322550495751781, "kl": 0.00588226318359375, "learning_rate": 1.4499999999999999e-06, "loss": 0.06, "reward": 10.221875250339508, "reward_std": 4.399149507284164, "rewards/accuracy_reward_staging": 0.8268750105053186, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.96875, "step": 29 }, { "epoch": 29.8, "grad_norm": 1.2977104318293018, "learning_rate": 1.5e-06, "loss": 0.0428, "step": 30 }, { "epoch": 29.8, "eval_clip_ratio": 0.0, "eval_completion_length": 615.9875, "eval_kl": 0.005646514892578125, "eval_loss": -0.0016943871742114425, "eval_reward": 10.84500024318695, "eval_reward_std": 4.436952286958695, "eval_rewards/accuracy_reward_staging": 0.8932500079274177, "eval_rewards/format_reward": 0.925, "eval_rewards/format_reward_staging": 0.9875, "eval_runtime": 140.4185, "eval_samples_per_second": 0.142, "eval_steps_per_second": 0.036, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 602.734375, "epoch": 30.8, "grad_norm": 1.2642898418324884, "kl": 0.006984233856201172, "learning_rate": 1.55e-06, "loss": -0.0342, "reward": 10.63593776524067, "reward_std": 4.29075089469552, "rewards/accuracy_reward_staging": 0.865937520749867, "rewards/format_reward": 0.9765625, "rewards/format_reward_staging": 1.0, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 562.6875, "epoch": 31.8, "grad_norm": 1.3685850711260872, "kl": 0.0068912506103515625, "learning_rate": 1.6e-06, "loss": 0.0138, "reward": 10.934375166893005, "reward_std": 5.272631503641605, "rewards/accuracy_reward_staging": 0.8996875174343586, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.953125, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 612.984375, "epoch": 32.8, "grad_norm": 1.2202427650766963, "kl": 0.00725555419921875, "learning_rate": 1.6499999999999999e-06, "loss": 0.0152, "reward": 11.089062750339508, "reward_std": 5.698997817933559, "rewards/accuracy_reward_staging": 0.9151562694460154, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 0.984375, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 637.859375, "epoch": 33.8, "grad_norm": 1.238938352076857, "kl": 0.00881195068359375, "learning_rate": 1.6999999999999998e-06, "loss": -0.0175, "reward": 10.721875220537186, "reward_std": 4.769842825829983, "rewards/accuracy_reward_staging": 0.8784375097602606, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 1.0, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 659.125, "epoch": 34.8, "grad_norm": 1.2817921497759655, "kl": 0.009099960327148438, "learning_rate": 1.75e-06, "loss": -0.0368, "reward": 11.725000083446503, "reward_std": 5.023090958595276, "rewards/accuracy_reward_staging": 0.9881250187754631, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.9375, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 627.671875, "epoch": 35.8, "grad_norm": 1.2427123786378356, "kl": 0.00952911376953125, "learning_rate": 1.8e-06, "loss": 0.0311, "reward": 10.635937720537186, "reward_std": 4.218031510710716, "rewards/accuracy_reward_staging": 0.8682812862098217, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.984375, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 606.109375, "epoch": 36.8, "grad_norm": 1.2742370115467172, "kl": 0.012310028076171875, "learning_rate": 1.85e-06, "loss": 0.0282, "reward": 11.673437774181366, "reward_std": 3.9488272815942764, "rewards/accuracy_reward_staging": 0.9720312729477882, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 1.0, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 651.328125, "epoch": 37.8, "grad_norm": 1.1726321374661877, "kl": 0.012788772583007812, "learning_rate": 1.8999999999999998e-06, "loss": 0.0163, "reward": 10.459375202655792, "reward_std": 4.296897903084755, "rewards/accuracy_reward_staging": 0.852187518030405, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 0.984375, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 617.984375, "epoch": 38.8, "grad_norm": 1.2806374425181677, "kl": 0.017009735107421875, "learning_rate": 1.95e-06, "loss": -0.0057, "reward": 10.478125214576721, "reward_std": 4.7161330208182335, "rewards/accuracy_reward_staging": 0.8525000084191561, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 1.0, "step": 39 }, { "epoch": 39.8, "grad_norm": 1.362092657080068, "learning_rate": 2e-06, "loss": -0.0269, "step": 40 }, { "epoch": 39.8, "eval_clip_ratio": 0.0, "eval_completion_length": 603.9875, "eval_kl": 0.0187530517578125, "eval_loss": 0.0036536618135869503, "eval_reward": 11.611250162124634, "eval_reward_std": 5.22377119064331, "eval_rewards/accuracy_reward_staging": 0.9636250138282776, "eval_rewards/format_reward": 0.975, "eval_rewards/format_reward_staging": 1.0, "eval_runtime": 132.1448, "eval_samples_per_second": 0.151, "eval_steps_per_second": 0.038, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 634.0, "epoch": 40.8, "grad_norm": 1.2640957862139377, "kl": 0.018407821655273438, "learning_rate": 1.999961923064171e-06, "loss": -0.0634, "reward": 11.232812687754631, "reward_std": 5.111758019775152, "rewards/accuracy_reward_staging": 0.9334375113248825, "rewards/format_reward": 0.9296875, "rewards/format_reward_staging": 0.96875, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 635.765625, "epoch": 41.8, "grad_norm": 1.268606166424927, "kl": 0.01999664306640625, "learning_rate": 1.9998476951563913e-06, "loss": 0.0283, "reward": 12.45000010728836, "reward_std": 4.9740989953279495, "rewards/accuracy_reward_staging": 1.0450000185519457, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 594.1875, "epoch": 42.8, "grad_norm": 1.3740022650965131, "kl": 0.020366668701171875, "learning_rate": 1.999657324975557e-06, "loss": -0.0149, "reward": 11.234375149011612, "reward_std": 5.008681446313858, "rewards/accuracy_reward_staging": 0.9250000100582838, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 0.984375, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 622.640625, "epoch": 43.8, "grad_norm": 1.2026028918578335, "kl": 0.02169036865234375, "learning_rate": 1.9993908270190957e-06, "loss": 0.0018, "reward": 11.873437762260437, "reward_std": 4.005194254219532, "rewards/accuracy_reward_staging": 0.9920312594622374, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.984375, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 637.0625, "epoch": 44.8, "grad_norm": 1.281104821624419, "kl": 0.022716522216796875, "learning_rate": 1.999048221581858e-06, "loss": 0.0455, "reward": 11.17031267285347, "reward_std": 4.456828519701958, "rewards/accuracy_reward_staging": 0.9201562497764826, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.984375, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 652.5, "epoch": 45.8, "grad_norm": 1.3614150864044823, "kl": 0.0223236083984375, "learning_rate": 1.998629534754574e-06, "loss": 0.0205, "reward": 10.348437696695328, "reward_std": 4.60803659260273, "rewards/accuracy_reward_staging": 0.8426562454551458, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.984375, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 626.359375, "epoch": 46.8, "grad_norm": 1.245324461252277, "kl": 0.0242156982421875, "learning_rate": 1.9981347984218667e-06, "loss": 0.0056, "reward": 13.950000077486038, "reward_std": 5.2063538283109665, "rewards/accuracy_reward_staging": 1.2059375159442425, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.953125, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 593.96875, "epoch": 47.8, "grad_norm": 1.3313171277926144, "kl": 0.02922821044921875, "learning_rate": 1.997564050259824e-06, "loss": 0.0449, "reward": 12.739062905311584, "reward_std": 4.18791925907135, "rewards/accuracy_reward_staging": 1.075468771159649, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 1.0, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 580.21875, "epoch": 48.8, "grad_norm": 1.3629349178288628, "kl": 0.03372955322265625, "learning_rate": 1.996917333733128e-06, "loss": 0.0174, "reward": 11.535937696695328, "reward_std": 3.948319137096405, "rewards/accuracy_reward_staging": 0.9614062653854489, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 0.96875, "step": 49 }, { "epoch": 49.8, "grad_norm": 1.210177545645759, "learning_rate": 1.9961946980917456e-06, "loss": 0.0148, "step": 50 }, { "epoch": 49.8, "eval_clip_ratio": 0.0, "eval_completion_length": 681.0125, "eval_kl": 0.0308837890625, "eval_loss": 0.084346242249012, "eval_reward": 12.700000238418578, "eval_reward_std": 4.6470307350158695, "eval_rewards/accuracy_reward_staging": 1.0800000175833702, "eval_rewards/format_reward": 0.9375, "eval_rewards/format_reward_staging": 0.9625, "eval_runtime": 184.5613, "eval_samples_per_second": 0.108, "eval_steps_per_second": 0.027, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 635.09375, "epoch": 50.8, "grad_norm": 1.3066463171281288, "kl": 0.033504486083984375, "learning_rate": 1.9953961983671786e-06, "loss": 0.026, "reward": 11.925000175833702, "reward_std": 5.033060222864151, "rewards/accuracy_reward_staging": 0.9956250190734863, "rewards/format_reward": 0.9765625, "rewards/format_reward_staging": 0.9921875, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 633.59375, "epoch": 51.8, "grad_norm": 1.4900533150827393, "kl": 0.040374755859375, "learning_rate": 1.994521895368273e-06, "loss": 0.0244, "reward": 11.634375095367432, "reward_std": 4.955964259803295, "rewards/accuracy_reward_staging": 0.9665625263005495, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.984375, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 642.84375, "epoch": 52.8, "grad_norm": 1.3018535470423898, "kl": 0.0359039306640625, "learning_rate": 1.9935718556765874e-06, "loss": 0.0176, "reward": 13.220312714576721, "reward_std": 6.300683185458183, "rewards/accuracy_reward_staging": 1.1282812524586916, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 625.59375, "epoch": 53.8, "grad_norm": 1.1747250249399175, "kl": 0.0344696044921875, "learning_rate": 1.992546151641322e-06, "loss": 0.0279, "reward": 12.729687660932541, "reward_std": 3.8526118397712708, "rewards/accuracy_reward_staging": 1.0854687504470348, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 616.421875, "epoch": 54.8, "grad_norm": 1.4193243535352469, "kl": 0.0395355224609375, "learning_rate": 1.9914448613738106e-06, "loss": 0.0064, "reward": 13.129687666893005, "reward_std": 5.78121767193079, "rewards/accuracy_reward_staging": 1.1223437692970037, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 593.609375, "epoch": 55.8, "grad_norm": 1.3523573855480968, "kl": 0.04061126708984375, "learning_rate": 1.99026806874157e-06, "loss": 0.0142, "reward": 13.071875303983688, "reward_std": 6.487003266811371, "rewards/accuracy_reward_staging": 1.1118750274181366, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.984375, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 606.71875, "epoch": 56.8, "grad_norm": 1.295861523746061, "kl": 0.04427337646484375, "learning_rate": 1.989015863361917e-06, "loss": 0.0139, "reward": 13.359375238418579, "reward_std": 6.011465005576611, "rewards/accuracy_reward_staging": 1.1375000104308128, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 1.0, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 632.140625, "epoch": 57.8, "grad_norm": 1.358415139567083, "kl": 0.0454254150390625, "learning_rate": 1.9876883405951377e-06, "loss": 0.0217, "reward": 12.626562535762787, "reward_std": 4.465259864926338, "rewards/accuracy_reward_staging": 1.0657812729477882, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 598.5, "epoch": 58.8, "grad_norm": 1.3223218232049598, "kl": 0.046783447265625, "learning_rate": 1.986285601537231e-06, "loss": 0.0257, "reward": 12.182812690734863, "reward_std": 6.196883611381054, "rewards/accuracy_reward_staging": 1.0292187482118607, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.953125, "step": 59 }, { "epoch": 59.8, "grad_norm": 1.2192909123573732, "learning_rate": 1.984807753012208e-06, "loss": 0.0537, "step": 60 }, { "epoch": 59.8, "eval_clip_ratio": 0.0, "eval_completion_length": 641.15, "eval_kl": 0.04580078125, "eval_loss": -0.05469979718327522, "eval_reward": 13.055000233650208, "eval_reward_std": 5.354214292764664, "eval_rewards/accuracy_reward_staging": 1.1130000218749045, "eval_rewards/format_reward": 0.9625, "eval_rewards/format_reward_staging": 0.9625, "eval_runtime": 152.2218, "eval_samples_per_second": 0.131, "eval_steps_per_second": 0.033, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 637.5546875, "epoch": 60.8, "grad_norm": 1.2273226448891168, "kl": 0.046173095703125, "learning_rate": 1.9832549075639547e-06, "loss": -0.0281, "reward": 12.067969009280205, "reward_std": 5.051002878695726, "rewards/accuracy_reward_staging": 1.0114843952469528, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.984375, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 626.640625, "epoch": 61.8, "grad_norm": 1.2972851245460888, "kl": 0.062713623046875, "learning_rate": 1.981627183447664e-06, "loss": 0.0389, "reward": 12.120312690734863, "reward_std": 3.9745979011058807, "rewards/accuracy_reward_staging": 1.0229687616229057, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.953125, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 669.671875, "epoch": 62.8, "grad_norm": 1.433658602561295, "kl": 0.05224609375, "learning_rate": 1.9799247046208295e-06, "loss": 0.0548, "reward": 13.040625154972076, "reward_std": 5.295711062848568, "rewards/accuracy_reward_staging": 1.1118750162422657, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 0.96875, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 618.8125, "epoch": 63.8, "grad_norm": 1.2699526702646982, "kl": 0.0532379150390625, "learning_rate": 1.9781476007338054e-06, "loss": 0.0405, "reward": 12.978125303983688, "reward_std": 5.858064912259579, "rewards/accuracy_reward_staging": 1.100937519222498, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.984375, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 622.25, "epoch": 64.8, "grad_norm": 1.2887790245964135, "kl": 0.0642242431640625, "learning_rate": 1.976296007119933e-06, "loss": 0.0309, "reward": 13.806250274181366, "reward_std": 5.3699341379106045, "rewards/accuracy_reward_staging": 1.1900000236928463, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 626.75, "epoch": 65.8, "grad_norm": 1.2658566344825595, "kl": 0.0558319091796875, "learning_rate": 1.9743700647852355e-06, "loss": -0.0173, "reward": 12.885937601327896, "reward_std": 5.130606591701508, "rewards/accuracy_reward_staging": 1.0917187500745058, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 603.84375, "epoch": 66.8, "grad_norm": 1.2421251352393117, "kl": 0.0610198974609375, "learning_rate": 1.9723699203976766e-06, "loss": 0.0279, "reward": 12.806250214576721, "reward_std": 4.9495924392249435, "rewards/accuracy_reward_staging": 1.0806250050663948, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 610.90625, "epoch": 67.8, "grad_norm": 1.2743636487669978, "kl": 0.07061767578125, "learning_rate": 1.9702957262759963e-06, "loss": 0.0096, "reward": 12.381250366568565, "reward_std": 4.895804196596146, "rewards/accuracy_reward_staging": 1.0443749986588955, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 0.984375, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 578.734375, "epoch": 68.8, "grad_norm": 19.774598485500963, "kl": 0.20587158203125, "learning_rate": 1.9681476403781077e-06, "loss": 0.0525, "reward": 13.853125363588333, "reward_std": 4.133783400058746, "rewards/accuracy_reward_staging": 1.1900000181049109, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.984375, "step": 69 }, { "epoch": 69.8, "grad_norm": 1.2630722978620754, "learning_rate": 1.965925826289068e-06, "loss": -0.05, "step": 70 }, { "epoch": 69.8, "eval_clip_ratio": 0.0, "eval_completion_length": 644.375, "eval_kl": 0.06834716796875, "eval_loss": 0.05038486793637276, "eval_reward": 13.201250171661377, "eval_reward_std": 5.853598284721374, "eval_rewards/accuracy_reward_staging": 1.1376250192523003, "eval_rewards/format_reward": 0.8875, "eval_rewards/format_reward_staging": 0.9375, "eval_runtime": 152.6561, "eval_samples_per_second": 0.131, "eval_steps_per_second": 0.033, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 617.921875, "epoch": 70.8, "grad_norm": 1.3309524076426662, "kl": 0.0696563720703125, "learning_rate": 1.963630453208623e-06, "loss": 0.0613, "reward": 13.469531431794167, "reward_std": 5.184730686247349, "rewards/accuracy_reward_staging": 1.1508593847975135, "rewards/format_reward": 0.9765625, "rewards/format_reward_staging": 0.984375, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 673.703125, "epoch": 71.8, "grad_norm": 1.3030149455750017, "kl": 0.0673980712890625, "learning_rate": 1.9612616959383188e-06, "loss": 0.0537, "reward": 14.193750262260437, "reward_std": 4.9487489387393, "rewards/accuracy_reward_staging": 1.2318750098347664, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 646.5625, "epoch": 72.8, "grad_norm": 1.2069285825783527, "kl": 0.0667724609375, "learning_rate": 1.958819734868193e-06, "loss": 0.0452, "reward": 13.829687654972076, "reward_std": 4.151405468583107, "rewards/accuracy_reward_staging": 1.1845312640070915, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 1.0, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 675.25, "epoch": 73.8, "grad_norm": 12.30456605431996, "kl": 0.144561767578125, "learning_rate": 1.9563047559630356e-06, "loss": 0.0238, "reward": 15.618750125169754, "reward_std": 5.085296101868153, "rewards/accuracy_reward_staging": 1.3665625024586916, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.984375, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 627.640625, "epoch": 74.8, "grad_norm": 435.4454579019399, "kl": 2.6427154541015625, "learning_rate": 1.953716950748227e-06, "loss": 0.1019, "reward": 14.799999952316284, "reward_std": 4.404626630246639, "rewards/accuracy_reward_staging": 1.2831250056624413, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 591.453125, "epoch": 75.8, "grad_norm": 1.5728290641754585, "kl": 0.0894012451171875, "learning_rate": 1.9510565162951534e-06, "loss": 0.0154, "reward": 14.187500238418579, "reward_std": 4.889563232660294, "rewards/accuracy_reward_staging": 1.2265625279396772, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.984375, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 618.296875, "epoch": 76.8, "grad_norm": 2.0802354331013824, "kl": 0.113311767578125, "learning_rate": 1.948323655206199e-06, "loss": 0.031, "reward": 14.854687571525574, "reward_std": 4.158232696354389, "rewards/accuracy_reward_staging": 1.2885937616229057, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.984375, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 652.46875, "epoch": 77.8, "grad_norm": 1.388498981753556, "kl": 0.086456298828125, "learning_rate": 1.945518575599317e-06, "loss": 0.0197, "reward": 14.209375262260437, "reward_std": 5.606824688613415, "rewards/accuracy_reward_staging": 1.2318750135600567, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 0.96875, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 604.703125, "epoch": 78.8, "grad_norm": 1.2332136505931421, "kl": 0.080596923828125, "learning_rate": 1.9426414910921785e-06, "loss": 0.0222, "reward": 14.440624922513962, "reward_std": 4.533382810652256, "rewards/accuracy_reward_staging": 1.247187502682209, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.984375, "step": 79 }, { "epoch": 79.8, "grad_norm": 1.2077461933733504, "learning_rate": 1.9396926207859082e-06, "loss": 0.0239, "step": 80 }, { "epoch": 79.8, "eval_clip_ratio": 0.0, "eval_completion_length": 592.1125, "eval_kl": 0.077099609375, "eval_loss": 0.03515242785215378, "eval_reward": 14.468750166893006, "eval_reward_std": 4.639398086071014, "eval_rewards/accuracy_reward_staging": 1.2506250083446502, "eval_rewards/format_reward": 0.9875, "eval_rewards/format_reward_staging": 0.975, "eval_runtime": 133.8538, "eval_samples_per_second": 0.149, "eval_steps_per_second": 0.037, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 644.046875, "epoch": 80.8, "grad_norm": 1.673636310153387, "kl": 0.07987213134765625, "learning_rate": 1.9366721892483973e-06, "loss": 0.0333, "reward": 14.308594018220901, "reward_std": 3.8065029891440645, "rewards/accuracy_reward_staging": 1.233984388411045, "rewards/format_reward": 0.9765625, "rewards/format_reward_staging": 0.9921875, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 644.65625, "epoch": 81.8, "grad_norm": 1.2475722094071593, "kl": 0.07330322265625, "learning_rate": 1.9335804264972015e-06, "loss": -0.0326, "reward": 12.793750315904617, "reward_std": 4.888111189007759, "rewards/accuracy_reward_staging": 1.080937497317791, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 1.0, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 600.578125, "epoch": 82.8, "grad_norm": 1.1629304999796195, "kl": 0.0791168212890625, "learning_rate": 1.9304175679820247e-06, "loss": 0.0416, "reward": 12.628125369548798, "reward_std": 4.35176794230938, "rewards/accuracy_reward_staging": 1.0706250071525574, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.984375, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 557.15625, "epoch": 83.8, "grad_norm": 1.439668786268354, "kl": 0.084381103515625, "learning_rate": 1.9271838545667875e-06, "loss": 0.0776, "reward": 12.189062774181366, "reward_std": 4.2925035655498505, "rewards/accuracy_reward_staging": 1.0235937517136335, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.984375, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 594.0625, "epoch": 84.8, "grad_norm": 1.3254589188157202, "kl": 0.0747528076171875, "learning_rate": 1.9238795325112867e-06, "loss": 0.0619, "reward": 15.909375101327896, "reward_std": 5.054341539740562, "rewards/accuracy_reward_staging": 1.3925000187009573, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 1.0, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 595.28125, "epoch": 85.8, "grad_norm": 1.2110318535178632, "kl": 0.0708465576171875, "learning_rate": 1.9205048534524403e-06, "loss": 0.0277, "reward": 13.023437589406967, "reward_std": 4.805721327662468, "rewards/accuracy_reward_staging": 1.1070312801748514, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.984375, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 613.15625, "epoch": 86.8, "grad_norm": 1.3901357478872436, "kl": 0.0800323486328125, "learning_rate": 1.917060074385124e-06, "loss": 0.0142, "reward": 14.523437321186066, "reward_std": 4.984271876513958, "rewards/accuracy_reward_staging": 1.257031261920929, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.984375, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 603.078125, "epoch": 87.8, "grad_norm": 1.338201226896483, "kl": 0.0803375244140625, "learning_rate": 1.9135454576426007e-06, "loss": 0.0304, "reward": 14.829687654972076, "reward_std": 6.223069980740547, "rewards/accuracy_reward_staging": 1.2939062640070915, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.953125, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 622.9375, "epoch": 88.8, "grad_norm": 2.521080873125213, "kl": 0.1446990966796875, "learning_rate": 1.909961270876543e-06, "loss": 0.0222, "reward": 14.043749928474426, "reward_std": 5.054637104272842, "rewards/accuracy_reward_staging": 1.2106250263750553, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 89 }, { "epoch": 89.8, "grad_norm": 1.20377714499055, "learning_rate": 1.9063077870366499e-06, "loss": 0.048, "step": 90 }, { "epoch": 89.8, "eval_clip_ratio": 0.0, "eval_completion_length": 648.2375, "eval_kl": 0.078076171875, "eval_loss": 0.07422037422657013, "eval_reward": 14.006250190734864, "eval_reward_std": 4.856718444824219, "eval_rewards/accuracy_reward_staging": 1.2081250160932542, "eval_rewards/format_reward": 0.95, "eval_rewards/format_reward_staging": 0.975, "eval_runtime": 145.5911, "eval_samples_per_second": 0.137, "eval_steps_per_second": 0.034, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 618.7265625, "epoch": 90.8, "grad_norm": 1.151785130187796, "kl": 0.0818328857421875, "learning_rate": 1.9025852843498606e-06, "loss": -0.0394, "reward": 14.92031255364418, "reward_std": 5.041832268238068, "rewards/accuracy_reward_staging": 1.2959375083446503, "rewards/format_reward": 0.9765625, "rewards/format_reward_staging": 0.984375, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 650.859375, "epoch": 91.8, "grad_norm": 1.0808576276629485, "kl": 0.0752716064453125, "learning_rate": 1.8987940462991669e-06, "loss": 0.0142, "reward": 13.932812631130219, "reward_std": 5.106485404074192, "rewards/accuracy_reward_staging": 1.1948437727987766, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 1.0, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 653.96875, "epoch": 92.8, "grad_norm": 1.1761749428173782, "kl": 0.0748443603515625, "learning_rate": 1.894934361602025e-06, "loss": 0.061, "reward": 13.756249964237213, "reward_std": 4.668057285249233, "rewards/accuracy_reward_staging": 1.185000006109476, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 643.140625, "epoch": 93.8, "grad_norm": 1.1826779106192027, "kl": 0.08154296875, "learning_rate": 1.8910065241883678e-06, "loss": 0.0113, "reward": 15.871875286102295, "reward_std": 5.009915418922901, "rewards/accuracy_reward_staging": 1.3918750323355198, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.984375, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 640.71875, "epoch": 94.8, "grad_norm": 1.220691618497292, "kl": 0.087249755859375, "learning_rate": 1.8870108331782216e-06, "loss": 0.0364, "reward": 15.275000274181366, "reward_std": 5.186372339725494, "rewards/accuracy_reward_staging": 1.3353125043213367, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.953125, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 638.9375, "epoch": 95.8, "grad_norm": 1.262017317075815, "kl": 0.092254638671875, "learning_rate": 1.8829475928589268e-06, "loss": 0.0112, "reward": 11.41250017285347, "reward_std": 5.790772080421448, "rewards/accuracy_reward_staging": 0.9459375087171793, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.984375, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 650.140625, "epoch": 96.8, "grad_norm": 1.555339606868479, "kl": 0.088897705078125, "learning_rate": 1.8788171126619653e-06, "loss": 0.0167, "reward": 13.132812529802322, "reward_std": 5.363104030489922, "rewards/accuracy_reward_staging": 1.1226562578231096, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 685.40625, "epoch": 97.8, "grad_norm": 1.1855081064145405, "kl": 0.0860595703125, "learning_rate": 1.8746197071393956e-06, "loss": -0.0101, "reward": 14.51250010728836, "reward_std": 5.773593910038471, "rewards/accuracy_reward_staging": 1.2575000114738941, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 632.890625, "epoch": 98.8, "grad_norm": 1.2154159531082918, "kl": 0.092254638671875, "learning_rate": 1.8703556959398995e-06, "loss": 0.0378, "reward": 13.392187654972076, "reward_std": 5.218963444232941, "rewards/accuracy_reward_staging": 1.1423437520861626, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.984375, "step": 99 }, { "epoch": 99.8, "grad_norm": 1.2652701809880356, "learning_rate": 1.8660254037844386e-06, "loss": 0.0158, "step": 100 }, { "epoch": 99.8, "eval_clip_ratio": 0.0, "eval_completion_length": 596.325, "eval_kl": 0.170458984375, "eval_loss": 0.038617830723524094, "eval_reward": 13.322500276565552, "eval_reward_std": 4.48788731098175, "eval_rewards/accuracy_reward_staging": 1.1335000172257423, "eval_rewards/format_reward": 0.9875, "eval_rewards/format_reward_staging": 1.0, "eval_runtime": 137.9647, "eval_samples_per_second": 0.145, "eval_steps_per_second": 0.036, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 625.546875, "epoch": 100.8, "grad_norm": 1.158984032803143, "kl": 0.0985870361328125, "learning_rate": 1.8616291604415257e-06, "loss": 0.0013, "reward": 13.735937476158142, "reward_std": 5.272327609360218, "rewards/accuracy_reward_staging": 1.174375013448298, "rewards/format_reward": 0.9921875, "rewards/format_reward_staging": 1.0, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 610.265625, "epoch": 101.8, "grad_norm": 1.1842728351019054, "kl": 0.095916748046875, "learning_rate": 1.8571673007021123e-06, "loss": 0.0156, "reward": 15.284374952316284, "reward_std": 4.7574154287576675, "rewards/accuracy_reward_staging": 1.330000001937151, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 1.0, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 640.171875, "epoch": 102.8, "grad_norm": 1.2460167784481468, "kl": 0.09686279296875, "learning_rate": 1.852640164354092e-06, "loss": -0.0181, "reward": 14.125000357627869, "reward_std": 4.396180346608162, "rewards/accuracy_reward_staging": 1.2203124929219484, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 0.96875, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 632.03125, "epoch": 103.8, "grad_norm": 1.2093960835662856, "kl": 0.096099853515625, "learning_rate": 1.8480480961564257e-06, "loss": -0.0125, "reward": 15.537500262260437, "reward_std": 4.605620868504047, "rewards/accuracy_reward_staging": 1.3553125225007534, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 1.0, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 626.1875, "epoch": 104.8, "grad_norm": 16.01421157394276, "kl": 0.21142578125, "learning_rate": 1.8433914458128857e-06, "loss": 0.0579, "reward": 13.903125166893005, "reward_std": 6.153450347483158, "rewards/accuracy_reward_staging": 1.1950000002980232, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.984375, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 646.25, "epoch": 105.8, "grad_norm": 1.3180024354800577, "kl": 0.10040283203125, "learning_rate": 1.838670567945424e-06, "loss": 0.068, "reward": 13.818750381469727, "reward_std": 5.729592114686966, "rewards/accuracy_reward_staging": 1.189687505364418, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.984375, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 607.875, "epoch": 106.8, "grad_norm": 2.9653782850707398, "kl": 0.14129638671875, "learning_rate": 1.833885822067168e-06, "loss": 0.0536, "reward": 15.423437595367432, "reward_std": 6.023381091654301, "rewards/accuracy_reward_staging": 1.3454687595367432, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 644.546875, "epoch": 107.8, "grad_norm": 1.4820796355726387, "kl": 0.09906005859375, "learning_rate": 1.8290375725550415e-06, "loss": 0.097, "reward": 14.023437529802322, "reward_std": 6.225167877972126, "rewards/accuracy_reward_staging": 1.2054687719792128, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.984375, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 585.25, "epoch": 108.8, "grad_norm": 1.653023129371335, "kl": 0.114166259765625, "learning_rate": 1.8241261886220154e-06, "loss": 0.0807, "reward": 14.356250017881393, "reward_std": 5.447244621813297, "rewards/accuracy_reward_staging": 1.2371875084936619, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 1.0, "step": 109 }, { "epoch": 109.8, "grad_norm": 1.2519961806572524, "learning_rate": 1.8191520442889917e-06, "loss": 0.0487, "step": 110 }, { "epoch": 109.8, "eval_clip_ratio": 0.0, "eval_completion_length": 607.9625, "eval_kl": 0.098583984375, "eval_loss": 0.015465144999325275, "eval_reward": 14.107500028610229, "eval_reward_std": 5.255042427778244, "eval_rewards/accuracy_reward_staging": 1.2157500088214874, "eval_rewards/format_reward": 0.9625, "eval_rewards/format_reward_staging": 0.9875, "eval_runtime": 141.9444, "eval_samples_per_second": 0.141, "eval_steps_per_second": 0.035, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 595.484375, "epoch": 110.8, "grad_norm": 1.2811054745764274, "kl": 0.112457275390625, "learning_rate": 1.8141155183563193e-06, "loss": 0.0085, "reward": 14.736718833446503, "reward_std": 5.505104329437017, "rewards/accuracy_reward_staging": 1.2760156439617276, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.9921875, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 589.59375, "epoch": 111.8, "grad_norm": 1.2357687466322653, "kl": 0.1142578125, "learning_rate": 1.8090169943749474e-06, "loss": -0.0064, "reward": 13.935937494039536, "reward_std": 4.747958414256573, "rewards/accuracy_reward_staging": 1.195156266912818, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 0.984375, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 604.953125, "epoch": 112.8, "grad_norm": 1.4229471167256136, "kl": 0.14752197265625, "learning_rate": 1.803856860617217e-06, "loss": 0.0281, "reward": 13.79843756556511, "reward_std": 5.385790981352329, "rewards/accuracy_reward_staging": 1.1845312714576721, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.96875, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 611.328125, "epoch": 113.8, "grad_norm": 24.422765952176345, "kl": 0.330352783203125, "learning_rate": 1.7986355100472927e-06, "loss": 0.0504, "reward": 14.092187762260437, "reward_std": 5.095987647771835, "rewards/accuracy_reward_staging": 1.2107812762260437, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 0.984375, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 567.453125, "epoch": 114.8, "grad_norm": 5.413230038864145, "kl": 0.17083740234375, "learning_rate": 1.7933533402912351e-06, "loss": 0.0736, "reward": 13.521874904632568, "reward_std": 4.76890967041254, "rewards/accuracy_reward_staging": 1.1584375277161598, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 589.703125, "epoch": 115.8, "grad_norm": 1.9014947184206379, "kl": 0.17449951171875, "learning_rate": 1.7880107536067217e-06, "loss": 0.0221, "reward": 12.971875101327896, "reward_std": 5.6128582283854485, "rewards/accuracy_reward_staging": 1.1065624989569187, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 0.953125, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 593.71875, "epoch": 116.8, "grad_norm": 5.015821786557117, "kl": 0.3743896484375, "learning_rate": 1.7826081568524138e-06, "loss": 0.0006, "reward": 14.44375005364418, "reward_std": 5.561103023588657, "rewards/accuracy_reward_staging": 1.247500006109476, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 541.34375, "epoch": 117.8, "grad_norm": 2.862584895806164, "kl": 0.159332275390625, "learning_rate": 1.7771459614569707e-06, "loss": -0.0004, "reward": 13.903124928474426, "reward_std": 4.877812258899212, "rewards/accuracy_reward_staging": 1.1950000133365393, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 0.953125, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 552.203125, "epoch": 118.8, "grad_norm": 1.2925016251536252, "kl": 0.136077880859375, "learning_rate": 1.7716245833877198e-06, "loss": 0.0437, "reward": 14.979687631130219, "reward_std": 5.167752608656883, "rewards/accuracy_reward_staging": 1.3042187504470348, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 119 }, { "epoch": 119.8, "grad_norm": 1.268704610469971, "learning_rate": 1.766044443118978e-06, "loss": 0.0381, "step": 120 }, { "epoch": 119.8, "eval_clip_ratio": 0.0, "eval_completion_length": 613.875, "eval_kl": 0.1421142578125, "eval_loss": 0.042472995817661285, "eval_reward": 14.473749923706055, "eval_reward_std": 4.867543476819992, "eval_rewards/accuracy_reward_staging": 1.2536250054836273, "eval_rewards/format_reward": 0.95, "eval_rewards/format_reward_staging": 0.9875, "eval_runtime": 140.6458, "eval_samples_per_second": 0.142, "eval_steps_per_second": 0.036, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 632.1796875, "epoch": 120.8, "grad_norm": 1.232115329282542, "kl": 0.1269073486328125, "learning_rate": 1.760405965600031e-06, "loss": 0.0088, "reward": 14.570312559604645, "reward_std": 5.063761539757252, "rewards/accuracy_reward_staging": 1.262500001117587, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.9765625, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 631.671875, "epoch": 121.8, "grad_norm": 1.3808041357571472, "kl": 0.14166259765625, "learning_rate": 1.7547095802227721e-06, "loss": 0.0158, "reward": 13.695312559604645, "reward_std": 5.737492188811302, "rewards/accuracy_reward_staging": 1.1742187663912773, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 0.953125, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 635.234375, "epoch": 122.8, "grad_norm": 1.247279598577896, "kl": 0.12890625, "learning_rate": 1.7489557207890023e-06, "loss": 0.0455, "reward": 12.946875035762787, "reward_std": 4.728762552142143, "rewards/accuracy_reward_staging": 1.100937519222498, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 578.359375, "epoch": 123.8, "grad_norm": 1.3012048204012603, "kl": 0.14288330078125, "learning_rate": 1.743144825477394e-06, "loss": 0.0237, "reward": 14.440625131130219, "reward_std": 5.539311669766903, "rewards/accuracy_reward_staging": 1.2471875082701445, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 618.046875, "epoch": 124.8, "grad_norm": 1.3325921669841398, "kl": 0.1463623046875, "learning_rate": 1.737277336810124e-06, "loss": 0.0604, "reward": 12.951562702655792, "reward_std": 3.5424299761652946, "rewards/accuracy_reward_staging": 1.1014062613248825, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 0.984375, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 600.65625, "epoch": 125.8, "grad_norm": 1.692915618997029, "kl": 0.158050537109375, "learning_rate": 1.7313537016191704e-06, "loss": 0.0314, "reward": 15.428125023841858, "reward_std": 5.270965404808521, "rewards/accuracy_reward_staging": 1.3459375128149986, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 616.15625, "epoch": 126.8, "grad_norm": 1.1976264555767795, "kl": 0.1231689453125, "learning_rate": 1.7253743710122874e-06, "loss": -0.0521, "reward": 15.524999856948853, "reward_std": 4.5880225002765656, "rewards/accuracy_reward_staging": 1.3556249924004078, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 636.265625, "epoch": 127.8, "grad_norm": 1.5736176734740694, "kl": 0.14605712890625, "learning_rate": 1.719339800338651e-06, "loss": -0.0053, "reward": 13.117187559604645, "reward_std": 4.341549597680569, "rewards/accuracy_reward_staging": 1.1273437663912773, "rewards/format_reward": 0.890625, "rewards/format_reward_staging": 0.953125, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 646.5, "epoch": 128.8, "grad_norm": 2.8516785822154693, "kl": 0.19573974609375, "learning_rate": 1.7132504491541815e-06, "loss": -0.0363, "reward": 13.059375166893005, "reward_std": 4.451074585318565, "rewards/accuracy_reward_staging": 1.1121875122189522, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 0.984375, "step": 129 }, { "epoch": 129.8, "grad_norm": 1.330165854405246, "learning_rate": 1.7071067811865474e-06, "loss": 0.0375, "step": 130 }, { "epoch": 129.8, "eval_clip_ratio": 0.0, "eval_completion_length": 637.2125, "eval_kl": 0.1220947265625, "eval_loss": 0.0286346934735775, "eval_reward": 14.201250052452087, "eval_reward_std": 5.366847103834152, "eval_rewards/accuracy_reward_staging": 1.2251250058412553, "eval_rewards/format_reward": 0.9625, "eval_rewards/format_reward_staging": 0.9875, "eval_runtime": 144.6284, "eval_samples_per_second": 0.138, "eval_steps_per_second": 0.035, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 584.5859375, "epoch": 130.8, "grad_norm": 1.307152401229373, "kl": 0.1373443603515625, "learning_rate": 1.7009092642998508e-06, "loss": -0.0099, "reward": 13.771875083446503, "reward_std": 5.035757407546043, "rewards/accuracy_reward_staging": 1.1803125254809856, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.984375, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 645.203125, "epoch": 131.8, "grad_norm": 1.2708814145835345, "kl": 0.129486083984375, "learning_rate": 1.6946583704589972e-06, "loss": 0.0643, "reward": 12.623437643051147, "reward_std": 5.542073376476765, "rewards/accuracy_reward_staging": 1.0654687564820051, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 0.96875, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 600.453125, "epoch": 132.8, "grad_norm": 1.293321609591619, "kl": 0.135955810546875, "learning_rate": 1.6883545756937537e-06, "loss": -0.0023, "reward": 13.604687690734863, "reward_std": 5.285826697945595, "rewards/accuracy_reward_staging": 1.163593776524067, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 622.75, "epoch": 133.8, "grad_norm": 1.25404998096487, "kl": 0.123260498046875, "learning_rate": 1.6819983600624985e-06, "loss": -0.0007, "reward": 13.665625005960464, "reward_std": 5.407533464720473, "rewards/accuracy_reward_staging": 1.1743750125169754, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 0.96875, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 543.90625, "epoch": 134.8, "grad_norm": 1.4502660038061592, "kl": 0.146087646484375, "learning_rate": 1.6755902076156602e-06, "loss": 0.0388, "reward": 13.443750143051147, "reward_std": 6.642620116472244, "rewards/accuracy_reward_staging": 1.1490625031292439, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.96875, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 652.296875, "epoch": 135.8, "grad_norm": 1.2487523655371195, "kl": 0.114105224609375, "learning_rate": 1.669130606358858e-06, "loss": 0.0277, "reward": 14.48281255364418, "reward_std": 5.782497301697731, "rewards/accuracy_reward_staging": 1.2576562836766243, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.9375, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 624.75, "epoch": 136.8, "grad_norm": 1.3402764758528873, "kl": 0.133087158203125, "learning_rate": 1.6626200482157374e-06, "loss": 0.0515, "reward": 12.865624994039536, "reward_std": 4.773457303643227, "rewards/accuracy_reward_staging": 1.0959375277161598, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 0.953125, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 581.875, "epoch": 137.8, "grad_norm": 1.2528044000047398, "kl": 0.121673583984375, "learning_rate": 1.6560590289905071e-06, "loss": 0.0046, "reward": 14.821875035762787, "reward_std": 5.201211467385292, "rewards/accuracy_reward_staging": 1.2837499883025885, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 0.984375, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 611.15625, "epoch": 138.8, "grad_norm": 1.6131227618811717, "kl": 0.118804931640625, "learning_rate": 1.6494480483301835e-06, "loss": 0.0186, "reward": 14.048437416553497, "reward_std": 4.07040748000145, "rewards/accuracy_reward_staging": 1.2157812491059303, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 0.96875, "step": 139 }, { "epoch": 139.8, "grad_norm": 1.348569648877767, "learning_rate": 1.6427876096865393e-06, "loss": 0.0477, "step": 140 }, { "epoch": 139.8, "eval_clip_ratio": 0.0, "eval_completion_length": 630.375, "eval_kl": 0.135888671875, "eval_loss": 0.021134015172719955, "eval_reward": 13.206250190734863, "eval_reward_std": 4.864674496650696, "eval_rewards/accuracy_reward_staging": 1.1293750241398812, "eval_rewards/format_reward": 0.95, "eval_rewards/format_reward_staging": 0.9625, "eval_runtime": 154.772, "eval_samples_per_second": 0.129, "eval_steps_per_second": 0.032, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 667.46875, "epoch": 140.8, "grad_norm": 1.3550397685799231, "kl": 0.1424407958984375, "learning_rate": 1.6360782202777638e-06, "loss": 0.022, "reward": 13.272656485438347, "reward_std": 5.271991036832333, "rewards/accuracy_reward_staging": 1.135859395377338, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9765625, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 581.15625, "epoch": 141.8, "grad_norm": 1.274351995849414, "kl": 0.1263427734375, "learning_rate": 1.6293203910498375e-06, "loss": 0.0166, "reward": 13.015625029802322, "reward_std": 5.490728512406349, "rewards/accuracy_reward_staging": 1.1062499918043613, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 1.0, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 581.9375, "epoch": 142.8, "grad_norm": 1.4551759286658215, "kl": 0.131011962890625, "learning_rate": 1.6225146366376196e-06, "loss": 0.0763, "reward": 13.957812488079071, "reward_std": 5.092830486595631, "rewards/accuracy_reward_staging": 1.2004687525331974, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.984375, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 616.640625, "epoch": 143.8, "grad_norm": 1.246097895354405, "kl": 0.108062744140625, "learning_rate": 1.615661475325658e-06, "loss": 0.0785, "reward": 13.190625131130219, "reward_std": 4.567478813230991, "rewards/accuracy_reward_staging": 1.1268750242888927, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.953125, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 641.4375, "epoch": 144.8, "grad_norm": 1.3492898310297994, "kl": 0.137298583984375, "learning_rate": 1.6087614290087205e-06, "loss": 0.0778, "reward": 13.187500149011612, "reward_std": 5.05699796974659, "rewards/accuracy_reward_staging": 1.1281250044703484, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 0.984375, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 615.421875, "epoch": 145.8, "grad_norm": 1.2389330703315027, "kl": 0.11785888671875, "learning_rate": 1.6018150231520484e-06, "loss": -0.0105, "reward": 13.951562494039536, "reward_std": 4.975374720990658, "rewards/accuracy_reward_staging": 1.1951562650501728, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 628.953125, "epoch": 146.8, "grad_norm": 1.1484661979759183, "kl": 0.10455322265625, "learning_rate": 1.5948227867513413e-06, "loss": 0.0083, "reward": 13.078125178813934, "reward_std": 4.869858503341675, "rewards/accuracy_reward_staging": 1.117187526077032, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 644.734375, "epoch": 147.8, "grad_norm": 3.8787328503041603, "kl": 0.148834228515625, "learning_rate": 1.587785252292473e-06, "loss": 0.0525, "reward": 12.510937660932541, "reward_std": 5.20218176394701, "rewards/accuracy_reward_staging": 1.0557812713086605, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.984375, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 622.6875, "epoch": 148.8, "grad_norm": 1.2193760476315514, "kl": 0.125579833984375, "learning_rate": 1.5807029557109397e-06, "loss": 0.0084, "reward": 13.721875101327896, "reward_std": 5.613097697496414, "rewards/accuracy_reward_staging": 1.180000003427267, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 0.96875, "step": 149 }, { "epoch": 149.8, "grad_norm": 1.3382966777749914, "learning_rate": 1.573576436351046e-06, "loss": 0.0283, "step": 150 }, { "epoch": 149.8, "eval_clip_ratio": 0.0, "eval_completion_length": 617.3625, "eval_kl": 0.1303466796875, "eval_loss": 0.013808819465339184, "eval_reward": 13.037500023841858, "eval_reward_std": 4.972468680143356, "eval_rewards/accuracy_reward_staging": 1.1087500318884849, "eval_rewards/format_reward": 0.9625, "eval_rewards/format_reward_staging": 0.9875, "eval_runtime": 145.0293, "eval_samples_per_second": 0.138, "eval_steps_per_second": 0.034, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 587.78125, "epoch": 150.8, "grad_norm": 1.288894243312563, "kl": 0.124786376953125, "learning_rate": 1.5664062369248328e-06, "loss": 0.0259, "reward": 14.903124868869781, "reward_std": 6.114742249250412, "rewards/accuracy_reward_staging": 1.2934375274926424, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 604.6875, "epoch": 151.8, "grad_norm": 1.3773815626279793, "kl": 0.126007080078125, "learning_rate": 1.5591929034707466e-06, "loss": 0.0712, "reward": 14.687500357627869, "reward_std": 5.71131344884634, "rewards/accuracy_reward_staging": 1.273437511175871, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.96875, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 615.921875, "epoch": 152.8, "grad_norm": 1.4485533003534696, "kl": 0.135894775390625, "learning_rate": 1.551936985312058e-06, "loss": 0.0497, "reward": 15.312499761581421, "reward_std": 4.291183479130268, "rewards/accuracy_reward_staging": 1.3562500244006515, "rewards/format_reward": 0.828125, "rewards/format_reward_staging": 0.921875, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 652.75, "epoch": 153.8, "grad_norm": 1.283755345470008, "kl": 0.14154052734375, "learning_rate": 1.544639035015027e-06, "loss": 0.0349, "reward": 13.10937511920929, "reward_std": 5.929120138287544, "rewards/accuracy_reward_staging": 1.1281250044703484, "rewards/format_reward": 0.859375, "rewards/format_reward_staging": 0.96875, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 579.21875, "epoch": 154.8, "grad_norm": 4.3223327812781625, "kl": 0.21929931640625, "learning_rate": 1.537299608346824e-06, "loss": 0.0015, "reward": 16.134375244379044, "reward_std": 5.386517338454723, "rewards/accuracy_reward_staging": 1.4243750181049109, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 0.96875, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 616.15625, "epoch": 155.8, "grad_norm": 1.600841914035174, "kl": 0.154449462890625, "learning_rate": 1.5299192642332049e-06, "loss": 0.0289, "reward": 14.656250029802322, "reward_std": 5.294310428202152, "rewards/accuracy_reward_staging": 1.2781250141561031, "rewards/format_reward": 0.890625, "rewards/format_reward_staging": 0.984375, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 589.578125, "epoch": 156.8, "grad_norm": 1.4513044601818303, "kl": 0.16229248046875, "learning_rate": 1.5224985647159488e-06, "loss": 0.0441, "reward": 14.590624958276749, "reward_std": 5.033867612481117, "rewards/accuracy_reward_staging": 1.273124998435378, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 0.9375, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 599.515625, "epoch": 157.8, "grad_norm": 1.3153063149563227, "kl": 0.148529052734375, "learning_rate": 1.5150380749100543e-06, "loss": 0.0618, "reward": 15.898437559604645, "reward_std": 4.4645668268203735, "rewards/accuracy_reward_staging": 1.4023437574505806, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 604.359375, "epoch": 158.8, "grad_norm": 8.479997272303532, "kl": 0.165130615234375, "learning_rate": 1.5075383629607041e-06, "loss": 0.0777, "reward": 13.142187595367432, "reward_std": 5.132370471954346, "rewards/accuracy_reward_staging": 1.132968744263053, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.90625, "step": 159 }, { "epoch": 159.8, "grad_norm": 1.1917014388947642, "learning_rate": 1.5e-06, "loss": -0.0404, "step": 160 }, { "epoch": 159.8, "eval_clip_ratio": 0.0, "eval_completion_length": 638.1625, "eval_kl": 0.14365234375, "eval_loss": 0.08888934552669525, "eval_reward": 14.796250009536744, "eval_reward_std": 5.661263364553451, "eval_rewards/accuracy_reward_staging": 1.295874996483326, "eval_rewards/format_reward": 0.9, "eval_rewards/format_reward_staging": 0.9375, "eval_runtime": 146.8527, "eval_samples_per_second": 0.136, "eval_steps_per_second": 0.034, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 651.7421875, "epoch": 160.8, "grad_norm": 1.2387655968511657, "kl": 0.1446075439453125, "learning_rate": 1.4924235601034672e-06, "loss": 0.0701, "reward": 15.571094110608101, "reward_std": 5.18467765673995, "rewards/accuracy_reward_staging": 1.3766406429931521, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.9609375, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 600.4375, "epoch": 161.8, "grad_norm": 1.229465314986415, "kl": 0.13531494140625, "learning_rate": 1.4848096202463372e-06, "loss": -0.0057, "reward": 16.17187523841858, "reward_std": 4.812445372343063, "rewards/accuracy_reward_staging": 1.4234375022351742, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 1.0, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 582.125, "epoch": 162.8, "grad_norm": 1.3529028258154412, "kl": 0.1470947265625, "learning_rate": 1.4771587602596083e-06, "loss": 0.0891, "reward": 15.801562637090683, "reward_std": 4.80203927308321, "rewards/accuracy_reward_staging": 1.392656246200204, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 611.609375, "epoch": 163.8, "grad_norm": 1.4068895541908286, "kl": 0.15350341796875, "learning_rate": 1.4694715627858908e-06, "loss": 0.0591, "reward": 14.951562643051147, "reward_std": 5.22976279258728, "rewards/accuracy_reward_staging": 1.3045312520116568, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 609.671875, "epoch": 164.8, "grad_norm": 1.2658845630248547, "kl": 0.1463623046875, "learning_rate": 1.461748613235034e-06, "loss": 0.0266, "reward": 14.03125, "reward_std": 5.7937397211790085, "rewards/accuracy_reward_staging": 1.215625025331974, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 0.953125, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 589.875, "epoch": 165.8, "grad_norm": 1.3344028785966706, "kl": 0.1558837890625, "learning_rate": 1.4539904997395467e-06, "loss": 0.0711, "reward": 15.390625029802322, "reward_std": 4.8061781376600266, "rewards/accuracy_reward_staging": 1.3515625335276127, "rewards/format_reward": 0.890625, "rewards/format_reward_staging": 0.984375, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 607.0625, "epoch": 166.8, "grad_norm": 1.6122384935377623, "kl": 0.178619384765625, "learning_rate": 1.4461978131098087e-06, "loss": 0.0224, "reward": 12.757812649011612, "reward_std": 5.976896375417709, "rewards/accuracy_reward_staging": 1.0867187604308128, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 0.96875, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 555.21875, "epoch": 167.8, "grad_norm": 1.3011797765950504, "kl": 0.15509033203125, "learning_rate": 1.4383711467890773e-06, "loss": -0.0096, "reward": 16.72499978542328, "reward_std": 4.908542029559612, "rewards/accuracy_reward_staging": 1.4756250157952309, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.984375, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 619.546875, "epoch": 168.8, "grad_norm": 1.71128104516776, "kl": 0.15618896484375, "learning_rate": 1.430511096808295e-06, "loss": 0.1091, "reward": 15.999999672174454, "reward_std": 4.338721185922623, "rewards/accuracy_reward_staging": 1.4124999977648258, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 169 }, { "epoch": 169.8, "grad_norm": 1.406915756546695, "learning_rate": 1.4226182617406994e-06, "loss": 0.0365, "step": 170 }, { "epoch": 169.8, "eval_clip_ratio": 0.0, "eval_completion_length": 599.3375, "eval_kl": 0.1652587890625, "eval_loss": 0.05737342685461044, "eval_reward": 15.347499918937682, "eval_reward_std": 5.264538067579269, "eval_rewards/accuracy_reward_staging": 1.346000000834465, "eval_rewards/format_reward": 0.925, "eval_rewards/format_reward_staging": 0.9625, "eval_runtime": 128.6979, "eval_samples_per_second": 0.155, "eval_steps_per_second": 0.039, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 607.75, "epoch": 170.8, "grad_norm": 1.4282138543951368, "kl": 0.17828369140625, "learning_rate": 1.414693242656239e-06, "loss": -0.0169, "reward": 15.774218946695328, "reward_std": 5.630698639899492, "rewards/accuracy_reward_staging": 1.3899218812584877, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 0.953125, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 597.484375, "epoch": 171.8, "grad_norm": 2.842026235946327, "kl": 0.22344970703125, "learning_rate": 1.4067366430758004e-06, "loss": 0.0117, "reward": 13.404687702655792, "reward_std": 5.611785896122456, "rewards/accuracy_reward_staging": 1.1482812836766243, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.953125, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 593.34375, "epoch": 172.8, "grad_norm": 1.355496874052129, "kl": 0.178131103515625, "learning_rate": 1.3987490689252462e-06, "loss": 0.0242, "reward": 15.953125089406967, "reward_std": 4.999604664742947, "rewards/accuracy_reward_staging": 1.403124986216426, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 1.0, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 636.546875, "epoch": 173.8, "grad_norm": 1.2708001429966154, "kl": 0.1480712890625, "learning_rate": 1.3907311284892735e-06, "loss": 0.0781, "reward": 16.860937863588333, "reward_std": 5.687329366803169, "rewards/accuracy_reward_staging": 1.4970312751829624, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 0.96875, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 629.34375, "epoch": 174.8, "grad_norm": 1.2881497238833757, "kl": 0.184661865234375, "learning_rate": 1.3826834323650898e-06, "loss": 0.0465, "reward": 14.581250131130219, "reward_std": 5.882216438651085, "rewards/accuracy_reward_staging": 1.2831250075250864, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.875, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 636.1875, "epoch": 175.8, "grad_norm": 1.3087334876201022, "kl": 0.180084228515625, "learning_rate": 1.374606593415912e-06, "loss": 0.0515, "reward": 15.878125071525574, "reward_std": 5.341188468039036, "rewards/accuracy_reward_staging": 1.4081249758601189, "rewards/format_reward": 0.859375, "rewards/format_reward_staging": 0.9375, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 620.109375, "epoch": 176.8, "grad_norm": 1.4376210080253358, "kl": 0.1929931640625, "learning_rate": 1.3665012267242972e-06, "loss": 0.0086, "reward": 13.495312541723251, "reward_std": 5.61242138594389, "rewards/accuracy_reward_staging": 1.1635937709361315, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 0.9375, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 626.03125, "epoch": 177.8, "grad_norm": 1772489.589709048, "kl": 7520.199645996094, "learning_rate": 1.3583679495453e-06, "loss": 413.327, "reward": 14.464062601327896, "reward_std": 6.149559870362282, "rewards/accuracy_reward_staging": 1.2729687709361315, "rewards/format_reward": 0.859375, "rewards/format_reward_staging": 0.875, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 622.34375, "epoch": 178.8, "grad_norm": 1.2715649881539406, "kl": 0.173065185546875, "learning_rate": 1.3502073812594674e-06, "loss": -0.0255, "reward": 16.45468744635582, "reward_std": 5.025842607021332, "rewards/accuracy_reward_staging": 1.4610937684774399, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 179 }, { "epoch": 179.8, "grad_norm": 1.3470338291497799, "learning_rate": 1.3420201433256689e-06, "loss": 0.0132, "step": 180 }, { "epoch": 179.8, "eval_clip_ratio": 0.0, "eval_completion_length": 644.8375, "eval_kl": 0.1693603515625, "eval_loss": 0.07019542157649994, "eval_reward": 15.515000081062317, "eval_reward_std": 4.510845869779587, "eval_rewards/accuracy_reward_staging": 1.3702499970793725, "eval_rewards/format_reward": 0.8625, "eval_rewards/format_reward_staging": 0.95, "eval_runtime": 140.9488, "eval_samples_per_second": 0.142, "eval_steps_per_second": 0.035, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 602.9765625, "epoch": 180.8, "grad_norm": 1.3491479753013993, "kl": 0.183990478515625, "learning_rate": 1.3338068592337708e-06, "loss": 0.0611, "reward": 15.047656297683716, "reward_std": 5.386000510305166, "rewards/accuracy_reward_staging": 1.3149218782782555, "rewards/format_reward": 0.9453125, "rewards/format_reward_staging": 0.953125, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 595.71875, "epoch": 181.8, "grad_norm": 1.3770681193765026, "kl": 0.19049072265625, "learning_rate": 1.3255681544571566e-06, "loss": 0.0247, "reward": 15.55312505364418, "reward_std": 5.63801646232605, "rewards/accuracy_reward_staging": 1.3615624941885471, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 0.984375, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 615.59375, "epoch": 182.8, "grad_norm": 1.4258618367759082, "kl": 0.18341064453125, "learning_rate": 1.3173046564050923e-06, "loss": 0.0382, "reward": 15.492187589406967, "reward_std": 5.1280196234583855, "rewards/accuracy_reward_staging": 1.3664062581956387, "rewards/format_reward": 0.890625, "rewards/format_reward_staging": 0.9375, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 657.21875, "epoch": 183.8, "grad_norm": 1.4496706627719373, "kl": 0.2099609375, "learning_rate": 1.3090169943749473e-06, "loss": -0.0093, "reward": 15.481249988079071, "reward_std": 4.195666573941708, "rewards/accuracy_reward_staging": 1.360624998807907, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 643.625, "epoch": 184.8, "grad_norm": 12.416744832467893, "kl": 0.35125732421875, "learning_rate": 1.3007057995042729e-06, "loss": 0.0552, "reward": 16.317187398672104, "reward_std": 5.490992607548833, "rewards/accuracy_reward_staging": 1.4489062502980232, "rewards/format_reward": 0.859375, "rewards/format_reward_staging": 0.96875, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 623.515625, "epoch": 185.8, "grad_norm": 1.3589509266087427, "kl": 0.20068359375, "learning_rate": 1.2923717047227368e-06, "loss": 0.0935, "reward": 13.135937482118607, "reward_std": 5.502887517213821, "rewards/accuracy_reward_staging": 1.1276562362909317, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.953125, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 617.5625, "epoch": 186.8, "grad_norm": 5.614254453009779, "kl": 0.2403564453125, "learning_rate": 1.2840153447039228e-06, "loss": 0.0561, "reward": 14.834374755620956, "reward_std": 6.0779377073049545, "rewards/accuracy_reward_staging": 1.2990624997764826, "rewards/format_reward": 0.890625, "rewards/format_reward_staging": 0.953125, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 636.03125, "epoch": 187.8, "grad_norm": 1.3627377674239467, "kl": 0.1864013671875, "learning_rate": 1.275637355816999e-06, "loss": 0.078, "reward": 13.956250250339508, "reward_std": 5.885370120406151, "rewards/accuracy_reward_staging": 1.2112500164657831, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 0.921875, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 605.875, "epoch": 188.8, "grad_norm": 1.4102127840998477, "kl": 0.1846923828125, "learning_rate": 1.2672383760782567e-06, "loss": 0.0346, "reward": 14.612500011920929, "reward_std": 6.431307382881641, "rewards/accuracy_reward_staging": 1.2737499997019768, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 0.953125, "step": 189 }, { "epoch": 189.8, "grad_norm": 1.2657153918632975, "learning_rate": 1.2588190451025207e-06, "loss": 0.0905, "step": 190 }, { "epoch": 189.8, "eval_clip_ratio": 0.0, "eval_completion_length": 639.3875, "eval_kl": 0.183544921875, "eval_loss": 0.09273408353328705, "eval_reward": 14.913750052452087, "eval_reward_std": 4.527625149488449, "eval_rewards/accuracy_reward_staging": 1.312625017762184, "eval_rewards/format_reward": 0.875, "eval_rewards/format_reward_staging": 0.9125, "eval_runtime": 149.5357, "eval_samples_per_second": 0.134, "eval_steps_per_second": 0.033, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 637.03125, "epoch": 190.8, "grad_norm": 1.5561907060681732, "kl": 0.20361328125, "learning_rate": 1.2503800040544414e-06, "loss": 0.027, "reward": 13.858593851327896, "reward_std": 5.260060213506222, "rewards/accuracy_reward_staging": 1.199921895749867, "rewards/format_reward": 0.9140625, "rewards/format_reward_staging": 0.9453125, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 614.578125, "epoch": 191.8, "grad_norm": 1.4112962880914386, "kl": 0.173828125, "learning_rate": 1.2419218955996676e-06, "loss": 0.0162, "reward": 15.854687571525574, "reward_std": 4.764394700527191, "rewards/accuracy_reward_staging": 1.3979687802493572, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 628.078125, "epoch": 192.8, "grad_norm": 1.329564996171061, "kl": 0.178253173828125, "learning_rate": 1.2334453638559054e-06, "loss": 0.0255, "reward": 13.543750017881393, "reward_std": 5.472193785011768, "rewards/accuracy_reward_staging": 1.168437510728836, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 0.9375, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 657.796875, "epoch": 193.8, "grad_norm": 1.8007713245581922, "kl": 0.198944091796875, "learning_rate": 1.2249510543438651e-06, "loss": 0.0516, "reward": 14.325000017881393, "reward_std": 4.515300907194614, "rewards/accuracy_reward_staging": 1.2465624995529652, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.953125, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 623.984375, "epoch": 194.8, "grad_norm": 1.2974682020385033, "kl": 0.1806640625, "learning_rate": 1.2164396139381029e-06, "loss": 0.0383, "reward": 14.945312559604645, "reward_std": 5.147151567041874, "rewards/accuracy_reward_staging": 1.308593761175871, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 0.9375, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 581.5625, "epoch": 195.8, "grad_norm": 1.260278322445417, "kl": 0.18658447265625, "learning_rate": 1.207911690817759e-06, "loss": 0.0477, "reward": 14.231250047683716, "reward_std": 5.530913561582565, "rewards/accuracy_reward_staging": 1.2418750207871199, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 604.90625, "epoch": 196.8, "grad_norm": 1.3096099803251504, "kl": 0.1900634765625, "learning_rate": 1.1993679344171972e-06, "loss": 0.0251, "reward": 15.50156244635582, "reward_std": 5.51077751070261, "rewards/accuracy_reward_staging": 1.3610937595367432, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.921875, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 605.453125, "epoch": 197.8, "grad_norm": 1.3586492398815606, "kl": 0.189208984375, "learning_rate": 1.1908089953765447e-06, "loss": 0.0523, "reward": 13.85781279206276, "reward_std": 4.988245405256748, "rewards/accuracy_reward_staging": 1.1998437773436308, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.953125, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 635.609375, "epoch": 198.8, "grad_norm": 1.340689220119322, "kl": 0.17425537109375, "learning_rate": 1.1822355254921476e-06, "loss": 0.042, "reward": 15.098437517881393, "reward_std": 4.625658318400383, "rewards/accuracy_reward_staging": 1.3145312629640102, "rewards/format_reward": 0.984375, "rewards/format_reward_staging": 0.96875, "step": 199 }, { "epoch": 199.8, "grad_norm": 1.38676313511267, "learning_rate": 1.1736481776669305e-06, "loss": 0.0078, "step": 200 }, { "epoch": 199.8, "eval_clip_ratio": 0.0, "eval_completion_length": 617.9625, "eval_kl": 0.227685546875, "eval_loss": -0.011267063207924366, "eval_reward": 15.479999876022339, "eval_reward_std": 5.917063271999359, "eval_rewards/accuracy_reward_staging": 1.358000010251999, "eval_rewards/format_reward": 0.95, "eval_rewards/format_reward_staging": 0.95, "eval_runtime": 140.7854, "eval_samples_per_second": 0.142, "eval_steps_per_second": 0.036, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 619.3828125, "epoch": 200.8, "grad_norm": 1.2618291004150899, "kl": 0.187408447265625, "learning_rate": 1.1650476058606774e-06, "loss": 0.0638, "reward": 14.914843901991844, "reward_std": 5.3753106370568275, "rewards/accuracy_reward_staging": 1.3063281308859587, "rewards/format_reward": 0.890625, "rewards/format_reward_staging": 0.9609375, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 661.421875, "epoch": 201.8, "grad_norm": 1.3037648155909478, "kl": 0.1773681640625, "learning_rate": 1.156434465040231e-06, "loss": 0.089, "reward": 14.482812568545341, "reward_std": 5.621522009372711, "rewards/accuracy_reward_staging": 1.263906242325902, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.90625, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 634.109375, "epoch": 202.8, "grad_norm": 7.276649221582544, "kl": 0.24066162109375, "learning_rate": 1.1478094111296109e-06, "loss": 0.0672, "reward": 14.378125011920929, "reward_std": 5.686573512852192, "rewards/accuracy_reward_staging": 1.2518749982118607, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.953125, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 630.71875, "epoch": 203.8, "grad_norm": 1.231495456965766, "kl": 0.2060546875, "learning_rate": 1.1391731009600653e-06, "loss": 0.1112, "reward": 14.843750059604645, "reward_std": 4.25174543261528, "rewards/accuracy_reward_staging": 1.298437513411045, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.953125, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 692.5625, "epoch": 204.8, "grad_norm": 1.350931922850144, "kl": 0.224212646484375, "learning_rate": 1.1305261922200517e-06, "loss": 0.0238, "reward": 15.984374910593033, "reward_std": 5.367278844118118, "rewards/accuracy_reward_staging": 1.4234375320374966, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.9375, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 690.828125, "epoch": 205.8, "grad_norm": 1.318954712841193, "kl": 0.19384765625, "learning_rate": 1.1218693434051474e-06, "loss": 0.0885, "reward": 16.017187118530273, "reward_std": 5.8362889885902405, "rewards/accuracy_reward_staging": 1.41890624538064, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.921875, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 632.796875, "epoch": 206.8, "grad_norm": 1.4124473460728744, "kl": 0.241455078125, "learning_rate": 1.1132032137679068e-06, "loss": 0.0533, "reward": 16.976562440395355, "reward_std": 5.929606184363365, "rewards/accuracy_reward_staging": 1.5070312470197678, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 646.421875, "epoch": 207.8, "grad_norm": 1.6773375733949765, "kl": 0.25189208984375, "learning_rate": 1.1045284632676535e-06, "loss": 0.0348, "reward": 14.135937511920929, "reward_std": 4.714122384786606, "rewards/accuracy_reward_staging": 1.2307812497019768, "rewards/format_reward": 0.890625, "rewards/format_reward_staging": 0.9375, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 617.140625, "epoch": 208.8, "grad_norm": 1.4348854069875467, "kl": 0.22174072265625, "learning_rate": 1.095845752520224e-06, "loss": 0.0495, "reward": 14.285937666893005, "reward_std": 5.9770321398973465, "rewards/accuracy_reward_staging": 1.2473437581211329, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 0.890625, "step": 209 }, { "epoch": 209.8, "grad_norm": 1.1605399733788089, "learning_rate": 1.0871557427476583e-06, "loss": 0.0552, "step": 210 }, { "epoch": 209.8, "eval_clip_ratio": 0.0, "eval_completion_length": 655.7875, "eval_kl": 0.201123046875, "eval_loss": -0.0035779415629804134, "eval_reward": 16.239999866485597, "eval_reward_std": 4.2823482871055605, "eval_rewards/accuracy_reward_staging": 1.4415000066161157, "eval_rewards/format_reward": 0.9, "eval_rewards/format_reward_staging": 0.925, "eval_runtime": 142.3261, "eval_samples_per_second": 0.141, "eval_steps_per_second": 0.035, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 646.625, "epoch": 210.8, "grad_norm": 1.3085478801780013, "kl": 0.21307373046875, "learning_rate": 1.078459095727845e-06, "loss": 0.0801, "reward": 14.151562541723251, "reward_std": 4.665051084011793, "rewards/accuracy_reward_staging": 1.2393749924376607, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.9140625, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 627.609375, "epoch": 211.8, "grad_norm": 1.3675217879957127, "kl": 0.22113037109375, "learning_rate": 1.069756473744125e-06, "loss": 0.0458, "reward": 15.82500010728836, "reward_std": 5.255921743810177, "rewards/accuracy_reward_staging": 1.4075000062584877, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.9375, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 617.390625, "epoch": 212.8, "grad_norm": 1.380807253404178, "kl": 0.20758056640625, "learning_rate": 1.061048539534857e-06, "loss": 0.0209, "reward": 15.371874898672104, "reward_std": 6.567200765013695, "rewards/accuracy_reward_staging": 1.3559375293552876, "rewards/format_reward": 0.890625, "rewards/format_reward_staging": 0.921875, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 620.953125, "epoch": 213.8, "grad_norm": 1.3902160353507123, "kl": 0.21124267578125, "learning_rate": 1.052335956242944e-06, "loss": 0.0436, "reward": 15.621874839067459, "reward_std": 5.122038297355175, "rewards/accuracy_reward_staging": 1.382499996572733, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.953125, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 655.359375, "epoch": 214.8, "grad_norm": 1.3806424551742382, "kl": 0.22705078125, "learning_rate": 1.043619387365336e-06, "loss": -0.0022, "reward": 13.870312362909317, "reward_std": 5.269171215593815, "rewards/accuracy_reward_staging": 1.2057812418788671, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.90625, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 655.09375, "epoch": 215.8, "grad_norm": 2.656219707386371, "kl": 0.2349853515625, "learning_rate": 1.034899496702501e-06, "loss": 0.0562, "reward": 14.140625029802322, "reward_std": 5.58522791415453, "rewards/accuracy_reward_staging": 1.2312500067055225, "rewards/format_reward": 0.890625, "rewards/format_reward_staging": 0.9375, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 622.8125, "epoch": 216.8, "grad_norm": 1.5209208648591772, "kl": 0.20660400390625, "learning_rate": 1.0261769483078732e-06, "loss": 0.0816, "reward": 14.775000303983688, "reward_std": 4.954350218176842, "rewards/accuracy_reward_staging": 1.2931250091642141, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 650.234375, "epoch": 217.8, "grad_norm": 1.55208660193829, "kl": 0.210693359375, "learning_rate": 1.0174524064372837e-06, "loss": 0.0744, "reward": 14.27812522649765, "reward_std": 4.545742444694042, "rewards/accuracy_reward_staging": 1.2575000151991844, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.921875, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 617.234375, "epoch": 218.8, "grad_norm": 1.3928599706734481, "kl": 0.20306396484375, "learning_rate": 1.0087265354983738e-06, "loss": 0.0865, "reward": 14.462500095367432, "reward_std": 5.321807600557804, "rewards/accuracy_reward_staging": 1.2571874894201756, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.984375, "step": 219 }, { "epoch": 219.8, "grad_norm": 1.5191361705497182, "learning_rate": 1e-06, "loss": 0.0929, "step": 220 }, { "epoch": 219.8, "eval_clip_ratio": 0.0, "eval_completion_length": 621.85, "eval_kl": 0.18828125, "eval_loss": 0.07175321877002716, "eval_reward": 15.787499904632568, "eval_reward_std": 4.991747093200684, "eval_rewards/accuracy_reward_staging": 1.39125002771616, "eval_rewards/format_reward": 0.925, "eval_rewards/format_reward_staging": 0.95, "eval_runtime": 143.7256, "eval_samples_per_second": 0.139, "eval_steps_per_second": 0.035, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 626.6953125, "epoch": 220.8, "grad_norm": 1.1979189103689327, "kl": 0.20904541015625, "learning_rate": 9.912734645016263e-07, "loss": 0.0653, "reward": 14.830468773841858, "reward_std": 4.90237557888031, "rewards/accuracy_reward_staging": 1.3041406441479921, "rewards/format_reward": 0.8515625, "rewards/format_reward_staging": 0.9375, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 618.171875, "epoch": 221.8, "grad_norm": 1.3245626778227455, "kl": 0.19866943359375, "learning_rate": 9.825475935627165e-07, "loss": 0.0378, "reward": 15.185937464237213, "reward_std": 6.512602657079697, "rewards/accuracy_reward_staging": 1.3451562821865082, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.921875, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 655.546875, "epoch": 222.8, "grad_norm": 1.2627576071000894, "kl": 0.22076416015625, "learning_rate": 9.73823051692127e-07, "loss": 0.0823, "reward": 14.023437589406967, "reward_std": 5.150766499340534, "rewards/accuracy_reward_staging": 1.2210937440395355, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 630.046875, "epoch": 223.8, "grad_norm": 1.4630069125653442, "kl": 0.22393798828125, "learning_rate": 9.651005032974993e-07, "loss": 0.1163, "reward": 15.462500154972076, "reward_std": 4.448259741067886, "rewards/accuracy_reward_staging": 1.3556250110268593, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 0.984375, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 626.4375, "epoch": 224.8, "grad_norm": 1.3737324086080591, "kl": 0.24346923828125, "learning_rate": 9.56380612634664e-07, "loss": 0.073, "reward": 13.575000047683716, "reward_std": 5.982485473155975, "rewards/accuracy_reward_staging": 1.174687497317791, "rewards/format_reward": 0.890625, "rewards/format_reward_staging": 0.9375, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 696.59375, "epoch": 225.8, "grad_norm": 1.4368525693959429, "kl": 0.2528076171875, "learning_rate": 9.476640437570561e-07, "loss": 0.0855, "reward": 14.559374749660492, "reward_std": 6.478231497108936, "rewards/accuracy_reward_staging": 1.2887500040233135, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.890625, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 655.390625, "epoch": 226.8, "grad_norm": 1.3423833107341736, "kl": 0.2432861328125, "learning_rate": 9.38951460465143e-07, "loss": 0.0918, "reward": 13.806249856948853, "reward_std": 5.268295802175999, "rewards/accuracy_reward_staging": 1.2056250125169754, "rewards/format_reward": 0.828125, "rewards/format_reward_staging": 0.921875, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 720.40625, "epoch": 227.8, "grad_norm": 2.0887623964966555, "kl": 0.26153564453125, "learning_rate": 9.302435262558747e-07, "loss": 0.0741, "reward": 14.531249985098839, "reward_std": 6.193335264921188, "rewards/accuracy_reward_staging": 1.2843750081956387, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.84375, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 699.34375, "epoch": 228.8, "grad_norm": 1.5817414267449967, "kl": 0.25372314453125, "learning_rate": 9.215409042721551e-07, "loss": 0.1477, "reward": 14.853124976158142, "reward_std": 7.3317131996154785, "rewards/accuracy_reward_staging": 1.3150000125169754, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.890625, "step": 229 }, { "epoch": 229.8, "grad_norm": 1.4707093666889888, "learning_rate": 9.128442572523417e-07, "loss": 0.1039, "step": 230 }, { "epoch": 229.8, "eval_clip_ratio": 0.0, "eval_completion_length": 699.725, "eval_kl": 0.27763671875, "eval_loss": 0.13621756434440613, "eval_reward": 15.102499961853027, "eval_reward_std": 5.6639987349510195, "eval_rewards/accuracy_reward_staging": 1.337749996781349, "eval_rewards/format_reward": 0.825, "eval_rewards/format_reward_staging": 0.9, "eval_runtime": 176.4013, "eval_samples_per_second": 0.113, "eval_steps_per_second": 0.028, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 750.5234375, "epoch": 230.8, "grad_norm": 2.5022162810953015, "kl": 0.310821533203125, "learning_rate": 9.04154247479776e-07, "loss": 0.0621, "reward": 13.921874970197678, "reward_std": 5.641379028558731, "rewards/accuracy_reward_staging": 1.2312500048428774, "rewards/format_reward": 0.7421875, "rewards/format_reward_staging": 0.8671875, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 737.5, "epoch": 231.8, "grad_norm": 1.3203764343240048, "kl": 0.2919921875, "learning_rate": 8.954715367323466e-07, "loss": 0.1254, "reward": 13.312500059604645, "reward_std": 5.994187116622925, "rewards/accuracy_reward_staging": 1.1750000100582838, "rewards/format_reward": 0.703125, "rewards/format_reward_staging": 0.859375, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 663.015625, "epoch": 232.8, "grad_norm": 1.6833018973839629, "kl": 0.306640625, "learning_rate": 8.867967862320933e-07, "loss": 0.0829, "reward": 11.451562643051147, "reward_std": 6.721396386623383, "rewards/accuracy_reward_staging": 0.9779687598347664, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.796875, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 745.671875, "epoch": 233.8, "grad_norm": 1.5784019252688062, "kl": 0.3076171875, "learning_rate": 8.781306565948526e-07, "loss": 0.0822, "reward": 13.026562601327896, "reward_std": 4.835877507925034, "rewards/accuracy_reward_staging": 1.143281283788383, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.8125, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 716.4375, "epoch": 234.8, "grad_norm": 1.5923082293463926, "kl": 0.3375244140625, "learning_rate": 8.694738077799486e-07, "loss": 0.0811, "reward": 13.98749989271164, "reward_std": 7.312740258872509, "rewards/accuracy_reward_staging": 1.2425000295042992, "rewards/format_reward": 0.796875, "rewards/format_reward_staging": 0.765625, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 728.125, "epoch": 235.8, "grad_norm": 1.520271972538422, "kl": 0.3232421875, "learning_rate": 8.608268990399348e-07, "loss": 0.1051, "reward": 13.317187458276749, "reward_std": 7.371540606021881, "rewards/accuracy_reward_staging": 1.1739062573760748, "rewards/format_reward": 0.71875, "rewards/format_reward_staging": 0.859375, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 804.15625, "epoch": 236.8, "grad_norm": 1.3584766642039745, "kl": 0.3104248046875, "learning_rate": 8.521905888703893e-07, "loss": 0.1753, "reward": 12.36562493443489, "reward_std": 7.354695707559586, "rewards/accuracy_reward_staging": 1.0818749964237213, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.765625, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 691.6875, "epoch": 237.8, "grad_norm": 1.4062373976771998, "kl": 0.3116455078125, "learning_rate": 8.435655349597689e-07, "loss": 0.1024, "reward": 13.564062476158142, "reward_std": 6.342557780444622, "rewards/accuracy_reward_staging": 1.2001562491059303, "rewards/format_reward": 0.796875, "rewards/format_reward_staging": 0.765625, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 825.25, "epoch": 238.8, "grad_norm": 1.3912617565210863, "kl": 0.326416015625, "learning_rate": 8.349523941393223e-07, "loss": 0.1304, "reward": 13.695312589406967, "reward_std": 7.123799741268158, "rewards/accuracy_reward_staging": 1.211718775331974, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.796875, "step": 239 }, { "epoch": 239.8, "grad_norm": 1.5595564499799097, "learning_rate": 8.263518223330696e-07, "loss": 0.2716, "step": 240 }, { "epoch": 239.8, "eval_clip_ratio": 0.0, "eval_completion_length": 710.4875, "eval_kl": 0.301416015625, "eval_loss": 0.1302235871553421, "eval_reward": 13.53874992132187, "eval_reward_std": 5.90446172952652, "eval_rewards/accuracy_reward_staging": 1.1926250100135802, "eval_rewards/format_reward": 0.825, "eval_rewards/format_reward_staging": 0.7875, "eval_runtime": 172.1092, "eval_samples_per_second": 0.116, "eval_steps_per_second": 0.029, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 820.21875, "epoch": 240.8, "grad_norm": 1.3594222983218733, "kl": 0.29791259765625, "learning_rate": 8.177644745078525e-07, "loss": 0.1216, "reward": 13.201562486588955, "reward_std": 6.912581101059914, "rewards/accuracy_reward_staging": 1.1646875077858567, "rewards/format_reward": 0.7734375, "rewards/format_reward_staging": 0.78125, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 714.53125, "epoch": 241.8, "grad_norm": 1.3736471708376299, "kl": 0.31427001953125, "learning_rate": 8.091910046234551e-07, "loss": 0.1539, "reward": 14.295312345027924, "reward_std": 5.410611517727375, "rewards/accuracy_reward_staging": 1.2639062507078052, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.8125, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 731.6875, "epoch": 242.8, "grad_norm": 1.4519054784859347, "kl": 0.33892822265625, "learning_rate": 8.006320655828029e-07, "loss": 0.1532, "reward": 11.943750023841858, "reward_std": 7.561622552573681, "rewards/accuracy_reward_staging": 1.0662500150501728, "rewards/format_reward": 0.640625, "rewards/format_reward_staging": 0.640625, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 807.59375, "epoch": 243.8, "grad_norm": 1.4438718122124043, "kl": 0.340576171875, "learning_rate": 7.920883091822408e-07, "loss": 0.1606, "reward": 10.796874925494194, "reward_std": 7.172753885388374, "rewards/accuracy_reward_staging": 0.9265625104308128, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.71875, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 755.296875, "epoch": 244.8, "grad_norm": 1.5583845174659146, "kl": 0.3447265625, "learning_rate": 7.835603860618971e-07, "loss": 0.2097, "reward": 11.843750149011612, "reward_std": 7.400421276688576, "rewards/accuracy_reward_staging": 1.0296875108033419, "rewards/format_reward": 0.796875, "rewards/format_reward_staging": 0.75, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 759.03125, "epoch": 245.8, "grad_norm": 1.3528957695912824, "kl": 0.357666015625, "learning_rate": 7.750489456561351e-07, "loss": 0.1164, "reward": 11.779687464237213, "reward_std": 6.353302523493767, "rewards/accuracy_reward_staging": 1.0295312507078052, "rewards/format_reward": 0.765625, "rewards/format_reward_staging": 0.71875, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 848.734375, "epoch": 246.8, "grad_norm": 1.554460288339238, "kl": 0.3973388671875, "learning_rate": 7.665546361440949e-07, "loss": 0.1267, "reward": 12.23906247317791, "reward_std": 7.234898805618286, "rewards/accuracy_reward_staging": 1.0817187651991844, "rewards/format_reward": 0.703125, "rewards/format_reward_staging": 0.71875, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 822.703125, "epoch": 247.8, "grad_norm": 1.433844235366676, "kl": 0.3592529296875, "learning_rate": 7.580781044003324e-07, "loss": 0.0235, "reward": 14.143749997019768, "reward_std": 5.741221696138382, "rewards/accuracy_reward_staging": 1.2581250127404928, "rewards/format_reward": 0.765625, "rewards/format_reward_staging": 0.796875, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 903.109375, "epoch": 248.8, "grad_norm": 1.3566149279604043, "kl": 0.31982421875, "learning_rate": 7.496199959455583e-07, "loss": 0.2022, "reward": 11.91718764603138, "reward_std": 6.288423582911491, "rewards/accuracy_reward_staging": 1.0448437514714897, "rewards/format_reward": 0.71875, "rewards/format_reward_staging": 0.75, "step": 249 }, { "epoch": 249.8, "grad_norm": 3.5268979600947743, "learning_rate": 7.411809548974791e-07, "loss": 0.2181, "step": 250 }, { "epoch": 249.8, "eval_clip_ratio": 0.0, "eval_completion_length": 863.7875, "eval_kl": 0.37177734375, "eval_loss": 0.22468861937522888, "eval_reward": 12.728750014305115, "eval_reward_std": 6.379789352416992, "eval_rewards/accuracy_reward_staging": 1.1216249838471413, "eval_rewards/format_reward": 0.775, "eval_rewards/format_reward_staging": 0.7375, "eval_runtime": 243.2396, "eval_samples_per_second": 0.082, "eval_steps_per_second": 0.021, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 816.5703125, "epoch": 250.8, "grad_norm": 1.3710303728682416, "kl": 0.4068603515625, "learning_rate": 7.327616239217431e-07, "loss": 0.176, "reward": 12.385937452316284, "reward_std": 6.818997707217932, "rewards/accuracy_reward_staging": 1.0815625092945993, "rewards/format_reward": 0.796875, "rewards/format_reward_staging": 0.7734375, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 811.671875, "epoch": 251.8, "grad_norm": 1.361752204741976, "kl": 0.39990234375, "learning_rate": 7.243626441830009e-07, "loss": 0.1261, "reward": 11.423437476158142, "reward_std": 7.801801845431328, "rewards/accuracy_reward_staging": 0.9985937401652336, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.65625, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 796.375, "epoch": 252.8, "grad_norm": 1.4560468845981351, "kl": 0.3995361328125, "learning_rate": 7.159846552960773e-07, "loss": 0.1045, "reward": 12.606250077486038, "reward_std": 7.680657230317593, "rewards/accuracy_reward_staging": 1.107500022277236, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.71875, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 951.03125, "epoch": 253.8, "grad_norm": 1.411783783971026, "kl": 0.36572265625, "learning_rate": 7.076282952772633e-07, "loss": 0.2697, "reward": 10.959374994039536, "reward_std": 8.183534801006317, "rewards/accuracy_reward_staging": 0.9631250090897083, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.578125, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 828.109375, "epoch": 254.8, "grad_norm": 1.573986411740295, "kl": 0.385498046875, "learning_rate": 6.992942004957269e-07, "loss": 0.2294, "reward": 11.017187476158142, "reward_std": 8.27015207707882, "rewards/accuracy_reward_staging": 0.9517187681049109, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.71875, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 896.015625, "epoch": 255.8, "grad_norm": 1.5503448731327603, "kl": 0.4189453125, "learning_rate": 6.909830056250526e-07, "loss": 0.233, "reward": 12.285937294363976, "reward_std": 7.363780289888382, "rewards/accuracy_reward_staging": 1.081718772649765, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.71875, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 891.390625, "epoch": 256.8, "grad_norm": 1.5949891985949471, "kl": 0.38238525390625, "learning_rate": 6.82695343594908e-07, "loss": 0.2359, "reward": 13.309375122189522, "reward_std": 7.619817182421684, "rewards/accuracy_reward_staging": 1.1700000204145908, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.734375, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 892.046875, "epoch": 257.8, "grad_norm": 1.6964810618377213, "kl": 0.375, "learning_rate": 6.744318455428435e-07, "loss": 0.3474, "reward": 11.334375008940697, "reward_std": 7.634813725948334, "rewards/accuracy_reward_staging": 0.9834375139325857, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.6875, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 723.828125, "epoch": 258.8, "grad_norm": 5.6325638555717505, "kl": 0.4317626953125, "learning_rate": 6.661931407662291e-07, "loss": 0.1424, "reward": 12.10312506556511, "reward_std": 7.089647740125656, "rewards/accuracy_reward_staging": 1.0399999842047691, "rewards/format_reward": 0.890625, "rewards/format_reward_staging": 0.8125, "step": 259 }, { "epoch": 259.8, "grad_norm": 1.4896714438939402, "learning_rate": 6.579798566743313e-07, "loss": 0.2349, "step": 260 }, { "epoch": 259.8, "eval_clip_ratio": 0.0, "eval_completion_length": 904.775, "eval_kl": 0.505517578125, "eval_loss": 0.21720127761363983, "eval_reward": 11.496249973773956, "eval_reward_std": 7.962953209877014, "eval_rewards/accuracy_reward_staging": 1.0021250128746033, "eval_rewards/format_reward": 0.825, "eval_rewards/format_reward_staging": 0.65, "eval_runtime": 252.62, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.02, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 851.5546875, "epoch": 260.8, "grad_norm": 1.4904521266628086, "kl": 0.43194580078125, "learning_rate": 6.497926187405324e-07, "loss": 0.1751, "reward": 12.633593738079071, "reward_std": 7.350790940225124, "rewards/accuracy_reward_staging": 1.1102343881502748, "rewards/format_reward": 0.8046875, "rewards/format_reward_staging": 0.7265625, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 854.90625, "epoch": 261.8, "grad_norm": 1.329154678834471, "kl": 0.3721923828125, "learning_rate": 6.416320504546997e-07, "loss": 0.1341, "reward": 12.78281256556511, "reward_std": 6.5289479941129684, "rewards/accuracy_reward_staging": 1.1282812533900142, "rewards/format_reward": 0.765625, "rewards/format_reward_staging": 0.734375, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 775.671875, "epoch": 262.8, "grad_norm": 1.7527252884904974, "kl": 0.3619384765625, "learning_rate": 6.334987732757028e-07, "loss": 0.2659, "reward": 11.443749904632568, "reward_std": 6.490789204835892, "rewards/accuracy_reward_staging": 0.9881250048056245, "rewards/format_reward": 0.828125, "rewards/format_reward_staging": 0.734375, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 837.03125, "epoch": 263.8, "grad_norm": 1.5611285133893833, "kl": 0.426025390625, "learning_rate": 6.253934065840879e-07, "loss": 0.1938, "reward": 10.160937517881393, "reward_std": 6.217289835214615, "rewards/accuracy_reward_staging": 0.8582812617532909, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.703125, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 982.140625, "epoch": 264.8, "grad_norm": 1.6308836635992412, "kl": 0.4444580078125, "learning_rate": 6.173165676349102e-07, "loss": 0.3391, "reward": 11.560937523841858, "reward_std": 8.43397456407547, "rewards/accuracy_reward_staging": 1.0092187635600567, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.6875, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 917.21875, "epoch": 265.8, "grad_norm": 8.538915516124892, "kl": 0.4678955078125, "learning_rate": 6.092688715107263e-07, "loss": 0.1789, "reward": 12.584374994039536, "reward_std": 5.365057937800884, "rewards/accuracy_reward_staging": 1.1021875068545341, "rewards/format_reward": 0.859375, "rewards/format_reward_staging": 0.703125, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 978.140625, "epoch": 266.8, "grad_norm": 11.685381653606097, "kl": 0.5882568359375, "learning_rate": 6.012509310747538e-07, "loss": 0.191, "reward": 10.77499994635582, "reward_std": 7.996917471289635, "rewards/accuracy_reward_staging": 0.9415625259280205, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.609375, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 925.015625, "epoch": 267.8, "grad_norm": 1.4792658565609935, "kl": 0.4212646484375, "learning_rate": 5.932633569241999e-07, "loss": 0.1863, "reward": 11.715624958276749, "reward_std": 6.7979661747813225, "rewards/accuracy_reward_staging": 1.0200000181794167, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.671875, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 845.140625, "epoch": 268.8, "grad_norm": 5.20673071015397, "kl": 0.4212646484375, "learning_rate": 5.853067573437611e-07, "loss": 0.3561, "reward": 13.192187458276749, "reward_std": 7.627192087471485, "rewards/accuracy_reward_staging": 1.164531260728836, "rewards/format_reward": 0.828125, "rewards/format_reward_staging": 0.71875, "step": 269 }, { "epoch": 269.8, "grad_norm": 1.3473441966408635, "learning_rate": 5.773817382593007e-07, "loss": 0.2184, "step": 270 }, { "epoch": 269.8, "eval_clip_ratio": 0.0, "eval_completion_length": 896.7625, "eval_kl": 0.446728515625, "eval_loss": 0.2557651400566101, "eval_reward": 11.58375017642975, "eval_reward_std": 7.534060525894165, "eval_rewards/accuracy_reward_staging": 1.0096250101923943, "eval_rewards/format_reward": 0.8, "eval_rewards/format_reward_staging": 0.6875, "eval_runtime": 257.488, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.019, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 945.8359375, "epoch": 270.8, "grad_norm": 1.8166241750746335, "kl": 0.4503173828125, "learning_rate": 5.694889031917046e-07, "loss": 0.2195, "reward": 10.942968711256981, "reward_std": 7.070375669747591, "rewards/accuracy_reward_staging": 0.9497656342573464, "rewards/format_reward": 0.7890625, "rewards/format_reward_staging": 0.65625, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 860.984375, "epoch": 271.8, "grad_norm": 1.595405733714329, "kl": 0.455810546875, "learning_rate": 5.616288532109224e-07, "loss": 0.2539, "reward": 11.723437696695328, "reward_std": 7.531994827091694, "rewards/accuracy_reward_staging": 1.0301562547683716, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.671875, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 892.765625, "epoch": 272.8, "grad_norm": 1.9592298387429952, "kl": 0.4637451171875, "learning_rate": 5.538021868901912e-07, "loss": 0.2683, "reward": 10.093750059604645, "reward_std": 6.664774626493454, "rewards/accuracy_reward_staging": 0.8687500087544322, "rewards/format_reward": 0.796875, "rewards/format_reward_staging": 0.609375, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 958.265625, "epoch": 273.8, "grad_norm": 98.566029551543, "kl": 1.8087158203125, "learning_rate": 5.460095002604532e-07, "loss": 0.3116, "reward": 9.573437586426735, "reward_std": 7.937933571636677, "rewards/accuracy_reward_staging": 0.8276562560349703, "rewards/format_reward": 0.71875, "rewards/format_reward_staging": 0.578125, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 915.875, "epoch": 274.8, "grad_norm": 1.54560131594488, "kl": 0.4423828125, "learning_rate": 5.382513867649663e-07, "loss": 0.2622, "reward": 10.931249961256981, "reward_std": 7.524611636996269, "rewards/accuracy_reward_staging": 0.9571875012479722, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.609375, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 856.765625, "epoch": 275.8, "grad_norm": 3.0239354302219548, "kl": 0.5811767578125, "learning_rate": 5.305284372141095e-07, "loss": 0.2451, "reward": 11.835937544703484, "reward_std": 6.662468932569027, "rewards/accuracy_reward_staging": 1.039843776728958, "rewards/format_reward": 0.734375, "rewards/format_reward_staging": 0.703125, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 932.6875, "epoch": 276.8, "grad_norm": 1.616305493128834, "kl": 0.538818359375, "learning_rate": 5.228412397403915e-07, "loss": 0.2794, "reward": 9.204687595367432, "reward_std": 8.746634840965271, "rewards/accuracy_reward_staging": 0.7923437561839819, "rewards/format_reward": 0.734375, "rewards/format_reward_staging": 0.546875, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 853.65625, "epoch": 277.8, "grad_norm": 1.5898137384166846, "kl": 0.4730224609375, "learning_rate": 5.15190379753663e-07, "loss": 0.2771, "reward": 9.235937505960464, "reward_std": 8.327273309230804, "rewards/accuracy_reward_staging": 0.786093763075769, "rewards/format_reward": 0.796875, "rewards/format_reward_staging": 0.578125, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 823.4375, "epoch": 278.8, "grad_norm": 2.44549487611301, "kl": 0.52734375, "learning_rate": 5.07576439896533e-07, "loss": 0.2175, "reward": 11.657812714576721, "reward_std": 8.151460975408554, "rewards/accuracy_reward_staging": 1.0251562464982271, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.65625, "step": 279 }, { "epoch": 279.8, "grad_norm": 2.0646627929666184, "learning_rate": 5.000000000000002e-07, "loss": 0.357, "step": 280 }, { "epoch": 279.8, "eval_clip_ratio": 0.0, "eval_completion_length": 810.45, "eval_kl": 0.4798828125, "eval_loss": 0.1928146332502365, "eval_reward": 12.850000190734864, "eval_reward_std": 7.15499917268753, "eval_rewards/accuracy_reward_staging": 1.1475000128149986, "eval_rewards/format_reward": 0.725, "eval_rewards/format_reward_staging": 0.65, "eval_runtime": 242.0902, "eval_samples_per_second": 0.083, "eval_steps_per_second": 0.021, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 885.203125, "epoch": 280.8, "grad_norm": 1.6342686921474878, "kl": 0.453857421875, "learning_rate": 4.924616370392961e-07, "loss": 0.2134, "reward": 11.285156175494194, "reward_std": 7.196100067347288, "rewards/accuracy_reward_staging": 0.9832031358964741, "rewards/format_reward": 0.8046875, "rewards/format_reward_staging": 0.6484375, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 827.796875, "epoch": 281.8, "grad_norm": 1.9777507622588804, "kl": 0.501708984375, "learning_rate": 4.849619250899458e-07, "loss": 0.2286, "reward": 9.923437535762787, "reward_std": 6.76015942543745, "rewards/accuracy_reward_staging": 0.8532812669873238, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.640625, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 760.53125, "epoch": 282.8, "grad_norm": 1.983747741222898, "kl": 0.5208740234375, "learning_rate": 4.775014352840512e-07, "loss": 0.2496, "reward": 10.876562595367432, "reward_std": 8.217148587107658, "rewards/accuracy_reward_staging": 0.9735937742516398, "rewards/format_reward": 0.609375, "rewards/format_reward_staging": 0.53125, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 800.21875, "epoch": 283.8, "grad_norm": 9.98641000305957, "kl": 0.650634765625, "learning_rate": 4.700807357667952e-07, "loss": 0.224, "reward": 10.59218743443489, "reward_std": 8.216856330633163, "rewards/accuracy_reward_staging": 0.910781248472631, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.640625, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 724.0625, "epoch": 284.8, "grad_norm": 3.1107143716405066, "kl": 0.58544921875, "learning_rate": 4.62700391653176e-07, "loss": 0.1371, "reward": 12.684374868869781, "reward_std": 5.954333983361721, "rewards/accuracy_reward_staging": 1.1121875066310167, "rewards/format_reward": 0.796875, "rewards/format_reward_staging": 0.765625, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 673.5, "epoch": 285.8, "grad_norm": 1.6272052880635073, "kl": 0.486572265625, "learning_rate": 4.5536096498497287e-07, "loss": 0.1643, "reward": 10.56093743443489, "reward_std": 8.084580287337303, "rewards/accuracy_reward_staging": 0.9310937505215406, "rewards/format_reward": 0.6875, "rewards/format_reward_staging": 0.5625, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 651.046875, "epoch": 286.8, "grad_norm": 44.36285030093844, "kl": 0.7974853515625, "learning_rate": 4.480630146879418e-07, "loss": 0.161, "reward": 11.729687303304672, "reward_std": 8.256633162498474, "rewards/accuracy_reward_staging": 1.02453126385808, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.703125, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 694.03125, "epoch": 287.8, "grad_norm": 1.805744925603759, "kl": 0.5167236328125, "learning_rate": 4.408070965292533e-07, "loss": 0.1719, "reward": 12.934375196695328, "reward_std": 8.110588558018208, "rewards/accuracy_reward_staging": 1.149687498807907, "rewards/format_reward": 0.734375, "rewards/format_reward_staging": 0.703125, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 806.671875, "epoch": 288.8, "grad_norm": 365.24516465790214, "kl": 15.93310546875, "learning_rate": 4.335937630751674e-07, "loss": 0.422, "reward": 10.040624976158142, "reward_std": 6.964074335992336, "rewards/accuracy_reward_staging": 0.8900000145658851, "rewards/format_reward": 0.609375, "rewards/format_reward_staging": 0.53125, "step": 289 }, { "epoch": 289.8, "grad_norm": 2.018184843282375, "learning_rate": 4.2642356364895417e-07, "loss": 0.159, "step": 290 }, { "epoch": 289.8, "eval_clip_ratio": 0.0, "eval_completion_length": 676.325, "eval_kl": 0.47177734375, "eval_loss": 0.08975062519311905, "eval_reward": 12.25, "eval_reward_std": 7.105983757972718, "eval_rewards/accuracy_reward_staging": 1.080000001192093, "eval_rewards/format_reward": 0.75, "eval_rewards/format_reward_staging": 0.7, "eval_runtime": 172.0863, "eval_samples_per_second": 0.116, "eval_steps_per_second": 0.029, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 756.625, "epoch": 290.8, "grad_norm": 4.304003408743737, "kl": 0.55126953125, "learning_rate": 4.192970442890602e-07, "loss": 0.1843, "reward": 10.37187498062849, "reward_std": 6.766278941184282, "rewards/accuracy_reward_staging": 0.9106250035110861, "rewards/format_reward": 0.671875, "rewards/format_reward_staging": 0.59375, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 722.5, "epoch": 291.8, "grad_norm": 1.4716878010716028, "kl": 0.4508056640625, "learning_rate": 4.1221474770752696e-07, "loss": 0.1393, "reward": 12.068749815225601, "reward_std": 6.005900785326958, "rewards/accuracy_reward_staging": 1.0553124994039536, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.703125, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 757.15625, "epoch": 292.8, "grad_norm": 1.6973289397039133, "kl": 0.4842529296875, "learning_rate": 4.0517721324865884e-07, "loss": 0.2115, "reward": 9.153125151991844, "reward_std": 7.310723379254341, "rewards/accuracy_reward_staging": 0.7809374900534749, "rewards/format_reward": 0.765625, "rewards/format_reward_staging": 0.578125, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 700.765625, "epoch": 293.8, "grad_norm": 2.1555090676393096, "kl": 0.51806640625, "learning_rate": 3.981849768479516e-07, "loss": 0.2632, "reward": 11.881249964237213, "reward_std": 7.455913960933685, "rewards/accuracy_reward_staging": 1.0396875217556953, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.703125, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 643.59375, "epoch": 294.8, "grad_norm": 1.6704306436530678, "kl": 0.459716796875, "learning_rate": 3.912385709912793e-07, "loss": 0.1352, "reward": 11.445312559604645, "reward_std": 7.4289940893650055, "rewards/accuracy_reward_staging": 1.008593775331974, "rewards/format_reward": 0.703125, "rewards/format_reward_staging": 0.65625, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 684.578125, "epoch": 295.8, "grad_norm": 1.7855400540888517, "kl": 0.528076171875, "learning_rate": 3.843385246743417e-07, "loss": 0.1663, "reward": 9.97968752682209, "reward_std": 8.184111461043358, "rewards/accuracy_reward_staging": 0.8635937534272671, "rewards/format_reward": 0.734375, "rewards/format_reward_staging": 0.609375, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 602.75, "epoch": 296.8, "grad_norm": 38.11905446217515, "kl": 1.2889404296875, "learning_rate": 3.774853633623806e-07, "loss": 0.2038, "reward": 11.821874991059303, "reward_std": 8.198160663247108, "rewards/accuracy_reward_staging": 1.0400000140070915, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.671875, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 585.765625, "epoch": 297.8, "grad_norm": 1.8197669464076098, "kl": 0.5150146484375, "learning_rate": 3.706796089501627e-07, "loss": 0.0749, "reward": 11.434375017881393, "reward_std": 7.715316243469715, "rewards/accuracy_reward_staging": 0.9934375211596489, "rewards/format_reward": 0.796875, "rewards/format_reward_staging": 0.703125, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 613.03125, "epoch": 298.8, "grad_norm": 1.889354762217911, "kl": 0.4669189453125, "learning_rate": 3.639217797222359e-07, "loss": 0.1515, "reward": 11.481249898672104, "reward_std": 7.798143312335014, "rewards/accuracy_reward_staging": 1.0090625192970037, "rewards/format_reward": 0.71875, "rewards/format_reward_staging": 0.671875, "step": 299 }, { "epoch": 299.8, "grad_norm": 2.6622348023095666, "learning_rate": 3.5721239031346063e-07, "loss": 0.2454, "step": 300 }, { "epoch": 299.8, "eval_clip_ratio": 0.0, "eval_completion_length": 590.3125, "eval_kl": 0.5787109375, "eval_loss": 0.10901384055614471, "eval_reward": 11.113749873638152, "eval_reward_std": 8.458426451683044, "eval_rewards/accuracy_reward_staging": 0.9851250126957893, "eval_rewards/format_reward": 0.65, "eval_rewards/format_reward_staging": 0.6125, "eval_runtime": 141.3322, "eval_samples_per_second": 0.142, "eval_steps_per_second": 0.035, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 614.75, "epoch": 300.8, "grad_norm": 1.5247800909935867, "kl": 0.50823974609375, "learning_rate": 3.5055195166981646e-07, "loss": 0.0888, "reward": 11.17812480777502, "reward_std": 7.858443755656481, "rewards/accuracy_reward_staging": 0.9834375130012631, "rewards/format_reward": 0.7421875, "rewards/format_reward_staging": 0.6015625, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 590.609375, "epoch": 301.8, "grad_norm": 11.67646802939455, "kl": 0.736083984375, "learning_rate": 3.4394097100949283e-07, "loss": 0.119, "reward": 11.040624797344208, "reward_std": 6.185135334730148, "rewards/accuracy_reward_staging": 0.9775000158697367, "rewards/format_reward": 0.65625, "rewards/format_reward_staging": 0.609375, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 592.828125, "epoch": 302.8, "grad_norm": 2.07620138910771, "kl": 0.5858154296875, "learning_rate": 3.373799517842627e-07, "loss": 0.1362, "reward": 9.728124901652336, "reward_std": 6.484044134616852, "rewards/accuracy_reward_staging": 0.8431250131689012, "rewards/format_reward": 0.671875, "rewards/format_reward_staging": 0.625, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 590.421875, "epoch": 303.8, "grad_norm": 1.80968722219858, "kl": 0.577392578125, "learning_rate": 3.308693936411421e-07, "loss": 0.1018, "reward": 9.63281275331974, "reward_std": 6.755122885107994, "rewards/accuracy_reward_staging": 0.827343761920929, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.609375, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 568.5625, "epoch": 304.8, "grad_norm": 2.0406619631130307, "kl": 0.5341796875, "learning_rate": 3.2440979238433974e-07, "loss": 0.1264, "reward": 10.746875032782555, "reward_std": 7.6421735137701035, "rewards/accuracy_reward_staging": 0.9465625076554716, "rewards/format_reward": 0.671875, "rewards/format_reward_staging": 0.609375, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 624.703125, "epoch": 305.8, "grad_norm": 2524.9442790112334, "kl": 23.13818359375, "learning_rate": 3.180016399375016e-07, "loss": 0.9704, "reward": 10.040624886751175, "reward_std": 7.3795405626297, "rewards/accuracy_reward_staging": 0.8853125032037497, "rewards/format_reward": 0.640625, "rewards/format_reward_staging": 0.546875, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 599.765625, "epoch": 306.8, "grad_norm": 22.332538208315107, "kl": 0.8387451171875, "learning_rate": 3.1164542430624586e-07, "loss": 0.1017, "reward": 11.599999964237213, "reward_std": 6.432632386684418, "rewards/accuracy_reward_staging": 1.035000003874302, "rewards/format_reward": 0.625, "rewards/format_reward_staging": 0.625, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 612.078125, "epoch": 307.8, "grad_norm": 103.35305685554316, "kl": 0.972900390625, "learning_rate": 3.0534162954100263e-07, "loss": 0.1358, "reward": 9.193749964237213, "reward_std": 7.3355728685855865, "rewards/accuracy_reward_staging": 0.7959375064820051, "rewards/format_reward": 0.65625, "rewards/format_reward_staging": 0.578125, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 570.96875, "epoch": 308.8, "grad_norm": 50.34680954325702, "kl": 1.0068359375, "learning_rate": 2.990907357001491e-07, "loss": 0.0898, "reward": 9.468749985098839, "reward_std": 7.638161733746529, "rewards/accuracy_reward_staging": 0.8328124992549419, "rewards/format_reward": 0.59375, "rewards/format_reward_staging": 0.546875, "step": 309 }, { "epoch": 309.8, "grad_norm": 2.404150826295122, "learning_rate": 2.9289321881345254e-07, "loss": 0.125, "step": 310 }, { "epoch": 309.8, "eval_clip_ratio": 0.0, "eval_completion_length": 544.0375, "eval_kl": 0.5705078125, "eval_loss": 0.10134752094745636, "eval_reward": 11.172499823570252, "eval_reward_std": 7.30764594078064, "eval_rewards/accuracy_reward_staging": 0.9935000017285347, "eval_rewards/format_reward": 0.625, "eval_rewards/format_reward_staging": 0.6125, "eval_runtime": 125.2508, "eval_samples_per_second": 0.16, "eval_steps_per_second": 0.04, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 594.6796875, "epoch": 310.8, "grad_norm": 4.496947390349957, "kl": 0.5538330078125, "learning_rate": 2.867495508458185e-07, "loss": 0.0803, "reward": 11.534374952316284, "reward_std": 6.893470458686352, "rewards/accuracy_reward_staging": 1.0221875067800283, "rewards/format_reward": 0.671875, "rewards/format_reward_staging": 0.640625, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 604.53125, "epoch": 311.8, "grad_norm": 1.8183482676019072, "kl": 0.561279296875, "learning_rate": 2.8066019966134904e-07, "loss": 0.1272, "reward": 10.914062589406967, "reward_std": 7.2590411230921745, "rewards/accuracy_reward_staging": 0.9617187697440386, "rewards/format_reward": 0.65625, "rewards/format_reward_staging": 0.640625, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 588.84375, "epoch": 312.8, "grad_norm": 2.2241952673115684, "kl": 0.574951171875, "learning_rate": 2.7462562898771256e-07, "loss": 0.0993, "reward": 11.553124994039536, "reward_std": 8.42781974375248, "rewards/accuracy_reward_staging": 1.0303124962374568, "rewards/format_reward": 0.671875, "rewards/format_reward_staging": 0.578125, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 551.140625, "epoch": 313.8, "grad_norm": 10.414899131526484, "kl": 0.54833984375, "learning_rate": 2.6864629838082954e-07, "loss": 0.0799, "reward": 11.815624713897705, "reward_std": 6.541705533862114, "rewards/accuracy_reward_staging": 1.0659374967217445, "rewards/format_reward": 0.59375, "rewards/format_reward_staging": 0.5625, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 553.046875, "epoch": 314.8, "grad_norm": 47.0803214727609, "kl": 0.74169921875, "learning_rate": 2.62722663189876e-07, "loss": 0.082, "reward": 8.54062494635582, "reward_std": 8.188691228628159, "rewards/accuracy_reward_staging": 0.7603125041350722, "rewards/format_reward": 0.453125, "rewards/format_reward_staging": 0.484375, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 568.03125, "epoch": 315.8, "grad_norm": 1.697069435904432, "kl": 0.52001953125, "learning_rate": 2.568551745226056e-07, "loss": 0.0445, "reward": 11.301562577486038, "reward_std": 7.986581727862358, "rewards/accuracy_reward_staging": 0.9989062640815973, "rewards/format_reward": 0.65625, "rewards/format_reward_staging": 0.65625, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 604.609375, "epoch": 316.8, "grad_norm": 1.9224538433799612, "kl": 0.5042724609375, "learning_rate": 2.510442792109978e-07, "loss": 0.1747, "reward": 11.126562386751175, "reward_std": 8.233518898487091, "rewards/accuracy_reward_staging": 0.9829687615856528, "rewards/format_reward": 0.703125, "rewards/format_reward_staging": 0.59375, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 540.96875, "epoch": 317.8, "grad_norm": 1.8852768343458246, "kl": 0.50634765625, "learning_rate": 2.45290419777228e-07, "loss": 0.0711, "reward": 10.67187511920929, "reward_std": 7.687072329223156, "rewards/accuracy_reward_staging": 0.9312500208616257, "rewards/format_reward": 0.6875, "rewards/format_reward_staging": 0.671875, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 586.21875, "epoch": 318.8, "grad_norm": 1.9063098360507449, "kl": 0.60400390625, "learning_rate": 2.395940343999691e-07, "loss": 0.072, "reward": 9.948437467217445, "reward_std": 8.150914326310158, "rewards/accuracy_reward_staging": 0.8745312532410026, "rewards/format_reward": 0.65625, "rewards/format_reward_staging": 0.546875, "step": 319 }, { "epoch": 319.8, "grad_norm": 2.0586826121884196, "learning_rate": 2.339555568810221e-07, "loss": 0.0995, "step": 320 }, { "epoch": 319.8, "eval_clip_ratio": 0.0, "eval_completion_length": 624.5875, "eval_kl": 1.31884765625, "eval_loss": 0.14124900102615356, "eval_reward": 10.472500038146972, "eval_reward_std": 8.316550683975219, "eval_rewards/accuracy_reward_staging": 0.9397500105202198, "eval_rewards/format_reward": 0.55, "eval_rewards/format_reward_staging": 0.525, "eval_runtime": 166.2792, "eval_samples_per_second": 0.12, "eval_steps_per_second": 0.03, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 531.609375, "epoch": 320.8, "grad_norm": 3.9996163379427734, "kl": 0.682861328125, "learning_rate": 2.283754166122802e-07, "loss": 0.1015, "reward": 11.93515631556511, "reward_std": 7.991424214094877, "rewards/accuracy_reward_staging": 1.0638281423598528, "rewards/format_reward": 0.65625, "rewards/format_reward_staging": 0.640625, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 536.390625, "epoch": 321.8, "grad_norm": 3.0819342365087095, "kl": 0.537841796875, "learning_rate": 2.228540385430291e-07, "loss": 0.0322, "reward": 10.235937386751175, "reward_std": 7.312522612512112, "rewards/accuracy_reward_staging": 0.9110937523655593, "rewards/format_reward": 0.5625, "rewards/format_reward_staging": 0.5625, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 564.328125, "epoch": 322.8, "grad_norm": 2.26675304418193, "kl": 0.61669921875, "learning_rate": 2.1739184314758607e-07, "loss": 0.1112, "reward": 9.662499949336052, "reward_std": 7.8978844210505486, "rewards/accuracy_reward_staging": 0.8537500146776438, "rewards/format_reward": 0.578125, "rewards/format_reward_staging": 0.546875, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 561.875, "epoch": 323.8, "grad_norm": 1.9954959778735344, "kl": 0.582763671875, "learning_rate": 2.1198924639327808e-07, "loss": 0.1118, "reward": 9.614062517881393, "reward_std": 8.075859874486923, "rewards/accuracy_reward_staging": 0.848906246945262, "rewards/format_reward": 0.578125, "rewards/format_reward_staging": 0.546875, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 513.0625, "epoch": 324.8, "grad_norm": 1.9291434012665167, "kl": 0.508056640625, "learning_rate": 2.0664665970876495e-07, "loss": 0.09, "reward": 11.331250041723251, "reward_std": 7.7114517986774445, "rewards/accuracy_reward_staging": 0.9878125172108412, "rewards/format_reward": 0.765625, "rewards/format_reward_staging": 0.6875, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 531.9375, "epoch": 325.8, "grad_norm": 2.130045177921353, "kl": 0.6065673828125, "learning_rate": 2.0136448995270738e-07, "loss": 0.1138, "reward": 9.910937532782555, "reward_std": 7.951791629195213, "rewards/accuracy_reward_staging": 0.8645312692970037, "rewards/format_reward": 0.65625, "rewards/format_reward_staging": 0.609375, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 590.3125, "epoch": 326.8, "grad_norm": 14.919466822673119, "kl": 0.8441162109375, "learning_rate": 1.961431393827827e-07, "loss": 0.1238, "reward": 9.217187523841858, "reward_std": 7.790774069726467, "rewards/accuracy_reward_staging": 0.7967187650501728, "rewards/format_reward": 0.65625, "rewards/format_reward_staging": 0.59375, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 581.578125, "epoch": 327.8, "grad_norm": 2.201816976112108, "kl": 0.5943603515625, "learning_rate": 1.9098300562505264e-07, "loss": 0.1018, "reward": 10.471874967217445, "reward_std": 7.630053393542767, "rewards/accuracy_reward_staging": 0.9268750208429992, "rewards/format_reward": 0.640625, "rewards/format_reward_staging": 0.5625, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 556.71875, "epoch": 328.8, "grad_norm": 5.906945093072998, "kl": 0.6585693359375, "learning_rate": 1.8588448164368087e-07, "loss": 0.0954, "reward": 10.989062532782555, "reward_std": 7.685741938650608, "rewards/accuracy_reward_staging": 0.972343759611249, "rewards/format_reward": 0.671875, "rewards/format_reward_staging": 0.59375, "step": 329 }, { "epoch": 329.8, "grad_norm": 2.0853287849886426, "learning_rate": 1.8084795571100809e-07, "loss": 0.0689, "step": 330 }, { "epoch": 329.8, "eval_clip_ratio": 0.0, "eval_completion_length": 538.2375, "eval_kl": 0.81767578125, "eval_loss": 0.1242348700761795, "eval_reward": 11.619999825954437, "eval_reward_std": 8.338389962911606, "eval_rewards/accuracy_reward_staging": 1.0357500161975621, "eval_rewards/format_reward": 0.6375, "eval_rewards/format_reward_staging": 0.625, "eval_runtime": 130.5157, "eval_samples_per_second": 0.153, "eval_steps_per_second": 0.038, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 552.21875, "epoch": 330.8, "grad_norm": 1.9411099886639303, "kl": 0.55401611328125, "learning_rate": 1.758738113779843e-07, "loss": 0.1164, "reward": 12.410937391221523, "reward_std": 6.827382728457451, "rewards/accuracy_reward_staging": 1.1129687398206443, "rewards/format_reward": 0.65625, "rewards/format_reward_staging": 0.625, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 550.75, "epoch": 331.8, "grad_norm": 2.1891389339166056, "kl": 0.5709228515625, "learning_rate": 1.7096242744495838e-07, "loss": -0.0025, "reward": 11.460937261581421, "reward_std": 6.8272934183478355, "rewards/accuracy_reward_staging": 1.0195312476716936, "rewards/format_reward": 0.671875, "rewards/format_reward_staging": 0.59375, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 524.5, "epoch": 332.8, "grad_norm": 2.0777537885395376, "kl": 0.536376953125, "learning_rate": 1.661141779328319e-07, "loss": 0.0535, "reward": 12.75, "reward_std": 7.35980150103569, "rewards/accuracy_reward_staging": 1.1328125018626451, "rewards/format_reward": 0.71875, "rewards/format_reward_staging": 0.703125, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 499.59375, "epoch": 333.8, "grad_norm": 4.1435358861337095, "kl": 0.5528564453125, "learning_rate": 1.6132943205457606e-07, "loss": 0.0848, "reward": 10.412499874830246, "reward_std": 8.24751353263855, "rewards/accuracy_reward_staging": 0.9053125167265534, "rewards/format_reward": 0.71875, "rewards/format_reward_staging": 0.640625, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 528.609375, "epoch": 334.8, "grad_norm": 2.837368202419978, "kl": 0.573486328125, "learning_rate": 1.566085541871145e-07, "loss": 0.0901, "reward": 13.03749993443489, "reward_std": 7.35784338414669, "rewards/accuracy_reward_staging": 1.1756250127218664, "rewards/format_reward": 0.640625, "rewards/format_reward_staging": 0.640625, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 536.09375, "epoch": 335.8, "grad_norm": 1.9595695246588565, "kl": 0.5914306640625, "learning_rate": 1.5195190384357404e-07, "loss": 0.0832, "reward": 9.493749916553497, "reward_std": 6.751585811376572, "rewards/accuracy_reward_staging": 0.8478124998509884, "rewards/format_reward": 0.546875, "rewards/format_reward_staging": 0.46875, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 519.390625, "epoch": 336.8, "grad_norm": 2.110347695686011, "kl": 0.519287109375, "learning_rate": 1.473598356459078e-07, "loss": 0.0886, "reward": 12.745312631130219, "reward_std": 7.440838478505611, "rewards/accuracy_reward_staging": 1.122968764975667, "rewards/format_reward": 0.796875, "rewards/format_reward_staging": 0.71875, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 547.65625, "epoch": 337.8, "grad_norm": 1.727304431939535, "kl": 0.4608154296875, "learning_rate": 1.4283269929788776e-07, "loss": 0.0791, "reward": 12.464062497019768, "reward_std": 6.970313638448715, "rewards/accuracy_reward_staging": 1.1073437612503767, "rewards/format_reward": 0.703125, "rewards/format_reward_staging": 0.6875, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 516.578125, "epoch": 338.8, "grad_norm": 45.73313544647479, "kl": 0.8197021484375, "learning_rate": 1.3837083955847417e-07, "loss": 0.102, "reward": 11.170312657952309, "reward_std": 7.535245016217232, "rewards/accuracy_reward_staging": 0.9779687668196857, "rewards/format_reward": 0.71875, "rewards/format_reward_staging": 0.671875, "step": 339 }, { "epoch": 339.8, "grad_norm": 2.6477795175870518, "learning_rate": 1.3397459621556128e-07, "loss": 0.1162, "step": 340 }, { "epoch": 339.8, "eval_clip_ratio": 0.0, "eval_completion_length": 529.1375, "eval_kl": 23.68701171875, "eval_loss": 1.2650219202041626, "eval_reward": 10.880000054836273, "eval_reward_std": 7.727067697048187, "eval_rewards/accuracy_reward_staging": 0.9605000212788581, "eval_rewards/format_reward": 0.65, "eval_rewards/format_reward_staging": 0.625, "eval_runtime": 120.192, "eval_samples_per_second": 0.166, "eval_steps_per_second": 0.042, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 560.4375, "epoch": 340.8, "grad_norm": 2.062257482287411, "kl": 0.56280517578125, "learning_rate": 1.296443040601003e-07, "loss": 0.0435, "reward": 10.376562610268593, "reward_std": 8.002589859068394, "rewards/accuracy_reward_staging": 0.9189062397927046, "rewards/format_reward": 0.6171875, "rewards/format_reward_staging": 0.5703125, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 539.75, "epoch": 341.8, "grad_norm": 2.2745767425023513, "kl": 0.53656005859375, "learning_rate": 1.2538029286060424e-07, "loss": 0.0623, "reward": 11.88906241953373, "reward_std": 8.809318155050278, "rewards/accuracy_reward_staging": 1.0607812739908695, "rewards/format_reward": 0.671875, "rewards/format_reward_staging": 0.609375, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 537.71875, "epoch": 342.8, "grad_norm": 1.9073471200149112, "kl": 0.48541259765625, "learning_rate": 1.2118288733803472e-07, "loss": 0.1154, "reward": 14.631249979138374, "reward_std": 6.283873476088047, "rewards/accuracy_reward_staging": 1.3256250023841858, "rewards/format_reward": 0.703125, "rewards/format_reward_staging": 0.671875, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 548.6875, "epoch": 343.8, "grad_norm": 2.2191165734186025, "kl": 0.6094970703125, "learning_rate": 1.1705240714107301e-07, "loss": 0.1021, "reward": 9.640625074505806, "reward_std": 8.212526381015778, "rewards/accuracy_reward_staging": 0.8593750139698386, "rewards/format_reward": 0.53125, "rewards/format_reward_staging": 0.515625, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 513.8125, "epoch": 344.8, "grad_norm": 1.9854032388069334, "kl": 0.523681640625, "learning_rate": 1.1298916682177828e-07, "loss": 0.0055, "reward": 14.028124868869781, "reward_std": 6.586119674146175, "rewards/accuracy_reward_staging": 1.2528125252574682, "rewards/format_reward": 0.765625, "rewards/format_reward_staging": 0.734375, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 538.0, "epoch": 345.8, "grad_norm": 1.922770111948013, "kl": 0.4796142578125, "learning_rate": 1.089934758116322e-07, "loss": 0.0353, "reward": 13.531249672174454, "reward_std": 7.330604811664671, "rewards/accuracy_reward_staging": 1.2109374832361937, "rewards/format_reward": 0.71875, "rewards/format_reward_staging": 0.703125, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 556.25, "epoch": 346.8, "grad_norm": 1.9617536727014244, "kl": 0.6065673828125, "learning_rate": 1.05065638397975e-07, "loss": 0.0607, "reward": 10.37343730032444, "reward_std": 7.6566493809223175, "rewards/accuracy_reward_staging": 0.9264062475413084, "rewards/format_reward": 0.5625, "rewards/format_reward_staging": 0.546875, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 514.421875, "epoch": 347.8, "grad_norm": 5.910623715329322, "kl": 0.586669921875, "learning_rate": 1.0120595370083318e-07, "loss": 0.0441, "reward": 10.33906227350235, "reward_std": 5.6453575268387794, "rewards/accuracy_reward_staging": 0.9151562377810478, "rewards/format_reward": 0.578125, "rewards/format_reward_staging": 0.609375, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 505.5625, "epoch": 348.8, "grad_norm": 2.2977218203304983, "kl": 0.5316162109375, "learning_rate": 9.741471565013958e-08, "loss": 0.0605, "reward": 12.892187714576721, "reward_std": 7.930499374866486, "rewards/accuracy_reward_staging": 1.1439062617719173, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.703125, "step": 349 }, { "epoch": 349.8, "grad_norm": 1.8861216023880358, "learning_rate": 9.369221296335006e-08, "loss": 0.1235, "step": 350 }, { "epoch": 349.8, "eval_clip_ratio": 0.0, "eval_completion_length": 520.25, "eval_kl": 0.54873046875, "eval_loss": 0.06522668898105621, "eval_reward": 12.468749737739563, "eval_reward_std": 7.662279307842255, "eval_rewards/accuracy_reward_staging": 1.1093750052154063, "eval_rewards/format_reward": 0.725, "eval_rewards/format_reward_staging": 0.65, "eval_runtime": 127.6467, "eval_samples_per_second": 0.157, "eval_steps_per_second": 0.039, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 525.09375, "epoch": 350.8, "grad_norm": 2.5663009167562287, "kl": 0.54559326171875, "learning_rate": 9.003872912345689e-08, "loss": 0.0985, "reward": 10.989843875169754, "reward_std": 7.33668365329504, "rewards/accuracy_reward_staging": 0.9677343829534948, "rewards/format_reward": 0.671875, "rewards/format_reward_staging": 0.640625, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 566.671875, "epoch": 351.8, "grad_norm": 3.306328897391672, "kl": 0.5091552734375, "learning_rate": 8.645454235739902e-08, "loss": 0.11, "reward": 11.37343755364418, "reward_std": 7.7504191398620605, "rewards/accuracy_reward_staging": 1.009218767285347, "rewards/format_reward": 0.671875, "rewards/format_reward_staging": 0.609375, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 525.484375, "epoch": 352.8, "grad_norm": 1.979792175196284, "kl": 0.520751953125, "learning_rate": 8.293992561487595e-08, "loss": 0.034, "reward": 10.399999901652336, "reward_std": 6.3909139558672905, "rewards/accuracy_reward_staging": 0.9212500043213367, "rewards/format_reward": 0.625, "rewards/format_reward_staging": 0.5625, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 505.015625, "epoch": 353.8, "grad_norm": 94.2447409316946, "kl": 2.2459716796875, "learning_rate": 7.949514654755962e-08, "loss": 0.1011, "reward": 10.320312514901161, "reward_std": 6.939793795347214, "rewards/accuracy_reward_staging": 0.9054687642492354, "rewards/format_reward": 0.65625, "rewards/format_reward_staging": 0.609375, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 551.140625, "epoch": 354.8, "grad_norm": 2.0400310680209444, "kl": 0.5252685546875, "learning_rate": 7.612046748871326e-08, "loss": 0.0919, "reward": 10.507812529802322, "reward_std": 6.574328362941742, "rewards/accuracy_reward_staging": 0.9257812593132257, "rewards/format_reward": 0.625, "rewards/format_reward_staging": 0.625, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 509.78125, "epoch": 355.8, "grad_norm": 10.548554530431085, "kl": 0.63427734375, "learning_rate": 7.281614543321269e-08, "loss": 0.0484, "reward": 12.018749877810478, "reward_std": 7.941742122173309, "rewards/accuracy_reward_staging": 1.0721874982118607, "rewards/format_reward": 0.671875, "rewards/format_reward_staging": 0.625, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 496.859375, "epoch": 356.8, "grad_norm": 2.9275259721912263, "kl": 0.5072021484375, "learning_rate": 6.958243201797553e-08, "loss": 0.0034, "reward": 13.354687303304672, "reward_std": 6.157866388559341, "rewards/accuracy_reward_staging": 1.1901562474668026, "rewards/format_reward": 0.71875, "rewards/format_reward_staging": 0.734375, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 497.078125, "epoch": 357.8, "grad_norm": 3.259670897497323, "kl": 0.56689453125, "learning_rate": 6.641957350279837e-08, "loss": 0.1212, "reward": 10.450000002980232, "reward_std": 8.285482600331306, "rewards/accuracy_reward_staging": 0.9262500102631748, "rewards/format_reward": 0.59375, "rewards/format_reward_staging": 0.59375, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 586.21875, "epoch": 358.8, "grad_norm": 3.6157884851011115, "kl": 0.7269287109375, "learning_rate": 6.332781075160243e-08, "loss": 0.1011, "reward": 8.88906255364418, "reward_std": 8.714880511164665, "rewards/accuracy_reward_staging": 0.7857812475413084, "rewards/format_reward": 0.53125, "rewards/format_reward_staging": 0.5, "step": 359 }, { "epoch": 359.8, "grad_norm": 3.272297448604345, "learning_rate": 6.030737921409168e-08, "loss": 0.0793, "step": 360 }, { "epoch": 359.8, "eval_clip_ratio": 0.0, "eval_completion_length": 534.5875, "eval_kl": 0.5283203125, "eval_loss": 0.10396303236484528, "eval_reward": 10.23999993801117, "eval_reward_std": 7.775251030921936, "eval_rewards/accuracy_reward_staging": 0.9027499988675117, "eval_rewards/format_reward": 0.6375, "eval_rewards/format_reward_staging": 0.575, "eval_runtime": 125.3052, "eval_samples_per_second": 0.16, "eval_steps_per_second": 0.04, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 529.671875, "epoch": 360.8, "grad_norm": 2.443851018491839, "kl": 0.60870361328125, "learning_rate": 5.735850890782157e-08, "loss": 0.0695, "reward": 11.39765627682209, "reward_std": 7.690338987857103, "rewards/accuracy_reward_staging": 1.0139843788929284, "rewards/format_reward": 0.6328125, "rewards/format_reward_staging": 0.625, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 525.890625, "epoch": 361.8, "grad_norm": 4.571161669605146, "kl": 0.7027587890625, "learning_rate": 5.448142440068315e-08, "loss": 0.0926, "reward": 12.457812368869781, "reward_std": 8.532767742872238, "rewards/accuracy_reward_staging": 1.097343759611249, "rewards/format_reward": 0.765625, "rewards/format_reward_staging": 0.71875, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 540.40625, "epoch": 362.8, "grad_norm": 2.6127548669423333, "kl": 0.56591796875, "learning_rate": 5.1676344793800675e-08, "loss": 0.1056, "reward": 10.995312467217445, "reward_std": 7.668447345495224, "rewards/accuracy_reward_staging": 0.9682812597602606, "rewards/format_reward": 0.65625, "rewards/format_reward_staging": 0.65625, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 518.546875, "epoch": 363.8, "grad_norm": 2.0552105230802944, "kl": 0.52197265625, "learning_rate": 4.8943483704846465e-08, "loss": 0.1321, "reward": 11.596875041723251, "reward_std": 8.213591203093529, "rewards/accuracy_reward_staging": 1.0190625078976154, "rewards/format_reward": 0.71875, "rewards/format_reward_staging": 0.6875, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 505.765625, "epoch": 364.8, "grad_norm": 2.326851393730122, "kl": 0.5279541015625, "learning_rate": 4.6283049251773176e-08, "loss": 0.0553, "reward": 11.739062711596489, "reward_std": 7.672401025891304, "rewards/accuracy_reward_staging": 1.0348437502980232, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.640625, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 549.078125, "epoch": 365.8, "grad_norm": 5.111208968520272, "kl": 0.6072998046875, "learning_rate": 4.3695244036964564e-08, "loss": 0.082, "reward": 9.354687571525574, "reward_std": 7.511838540434837, "rewards/accuracy_reward_staging": 0.8167187636718154, "rewards/format_reward": 0.609375, "rewards/format_reward_staging": 0.578125, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 517.296875, "epoch": 366.8, "grad_norm": 2.581279648987323, "kl": 0.5887451171875, "learning_rate": 4.1180265131806946e-08, "loss": 0.0205, "reward": 10.934374988079071, "reward_std": 8.458094909787178, "rewards/accuracy_reward_staging": 0.9778125118464231, "rewards/format_reward": 0.59375, "rewards/format_reward_staging": 0.5625, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 518.40625, "epoch": 367.8, "grad_norm": 2.035652649324917, "kl": 0.569091796875, "learning_rate": 3.87383040616811e-08, "loss": 0.0615, "reward": 11.773437261581421, "reward_std": 7.848160028457642, "rewards/accuracy_reward_staging": 1.0507812481373549, "rewards/format_reward": 0.65625, "rewards/format_reward_staging": 0.609375, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 568.703125, "epoch": 368.8, "grad_norm": 2.8704472982245526, "kl": 0.57958984375, "learning_rate": 3.636954679137705e-08, "loss": 0.081, "reward": 9.676562517881393, "reward_std": 8.41864463686943, "rewards/accuracy_reward_staging": 0.8692187499254942, "rewards/format_reward": 0.515625, "rewards/format_reward_staging": 0.46875, "step": 369 }, { "epoch": 369.8, "grad_norm": 8.688192162327972, "learning_rate": 3.4074173710931796e-08, "loss": 0.092, "step": 370 }, { "epoch": 369.8, "eval_clip_ratio": 0.0, "eval_completion_length": 537.7875, "eval_kl": 0.5083984375, "eval_loss": 0.07736475765705109, "eval_reward": 11.122499930858613, "eval_reward_std": 6.3223115285858515, "eval_rewards/accuracy_reward_staging": 0.989750000461936, "eval_rewards/format_reward": 0.6375, "eval_rewards/format_reward_staging": 0.5875, "eval_runtime": 137.4717, "eval_samples_per_second": 0.145, "eval_steps_per_second": 0.036, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 548.3203125, "epoch": 370.8, "grad_norm": 2.9935915085575524, "kl": 0.7093505859375, "learning_rate": 3.185235962189237e-08, "loss": 0.086, "reward": 10.013281270861626, "reward_std": 7.9125730618834496, "rewards/accuracy_reward_staging": 0.892734372522682, "rewards/format_reward": 0.5625, "rewards/format_reward_staging": 0.5234375, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 507.0, "epoch": 371.8, "grad_norm": 2.2371928274108455, "kl": 0.517822265625, "learning_rate": 2.9704273724003526e-08, "loss": 0.0676, "reward": 11.292187675833702, "reward_std": 6.194611236453056, "rewards/accuracy_reward_staging": 1.0042187473736703, "rewards/format_reward": 0.65625, "rewards/format_reward_staging": 0.59375, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 559.84375, "epoch": 372.8, "grad_norm": 2.531647136891463, "kl": 0.52001953125, "learning_rate": 2.7630079602323443e-08, "loss": 0.1587, "reward": 10.668749883770943, "reward_std": 8.836217179894447, "rewards/accuracy_reward_staging": 0.9575000163167715, "rewards/format_reward": 0.546875, "rewards/format_reward_staging": 0.546875, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 503.234375, "epoch": 373.8, "grad_norm": 1.8905690009692866, "kl": 0.56494140625, "learning_rate": 2.5629935214764864e-08, "loss": 0.0793, "reward": 10.550000071525574, "reward_std": 7.391293793916702, "rewards/accuracy_reward_staging": 0.931562501937151, "rewards/format_reward": 0.640625, "rewards/format_reward_staging": 0.59375, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 518.109375, "epoch": 374.8, "grad_norm": 4.31839552805722, "kl": 0.7208251953125, "learning_rate": 2.3703992880066636e-08, "loss": 0.0576, "reward": 11.893750101327896, "reward_std": 7.522340267896652, "rewards/accuracy_reward_staging": 1.0612500254064798, "rewards/format_reward": 0.65625, "rewards/format_reward_staging": 0.625, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 560.953125, "epoch": 375.8, "grad_norm": 2.2893855348554824, "kl": 0.4915771484375, "learning_rate": 2.185239926619431e-08, "loss": 0.0645, "reward": 11.574999958276749, "reward_std": 8.68027800321579, "rewards/accuracy_reward_staging": 1.0246875025331974, "rewards/format_reward": 0.671875, "rewards/format_reward_staging": 0.65625, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 503.046875, "epoch": 376.8, "grad_norm": 2.0521716339261573, "kl": 0.5130615234375, "learning_rate": 2.007529537917041e-08, "loss": 0.0747, "reward": 11.942187368869781, "reward_std": 6.6959647461771965, "rewards/accuracy_reward_staging": 1.0660937773063779, "rewards/format_reward": 0.640625, "rewards/format_reward_staging": 0.640625, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 544.703125, "epoch": 377.8, "grad_norm": 2.6744855588179597, "kl": 0.6314697265625, "learning_rate": 1.8372816552336023e-08, "loss": 0.0892, "reward": 10.899999856948853, "reward_std": 7.818521216511726, "rewards/accuracy_reward_staging": 0.957187520340085, "rewards/format_reward": 0.6875, "rewards/format_reward_staging": 0.640625, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 536.0, "epoch": 378.8, "grad_norm": 2.1993782801564827, "kl": 0.5281982421875, "learning_rate": 1.6745092436045492e-08, "loss": 0.09, "reward": 10.765625104308128, "reward_std": 9.52804271876812, "rewards/accuracy_reward_staging": 0.9468750059604645, "rewards/format_reward": 0.671875, "rewards/format_reward_staging": 0.625, "step": 379 }, { "epoch": 379.8, "grad_norm": 2.850728275624988, "learning_rate": 1.519224698779198e-08, "loss": 0.0848, "step": 380 }, { "epoch": 379.8, "eval_clip_ratio": 0.0, "eval_completion_length": 504.675, "eval_kl": 0.4578125, "eval_loss": 0.07416948676109314, "eval_reward": 11.396249985694885, "eval_reward_std": 8.601353228092194, "eval_rewards/accuracy_reward_staging": 1.0146250143647193, "eval_rewards/format_reward": 0.6375, "eval_rewards/format_reward_staging": 0.6125, "eval_runtime": 121.4493, "eval_samples_per_second": 0.165, "eval_steps_per_second": 0.041, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 510.578125, "epoch": 380.8, "grad_norm": 2.9736298141622295, "kl": 0.55657958984375, "learning_rate": 1.3714398462768562e-08, "loss": 0.0869, "reward": 12.212500043213367, "reward_std": 7.73147202283144, "rewards/accuracy_reward_staging": 1.0837499964982271, "rewards/format_reward": 0.6953125, "rewards/format_reward_staging": 0.6796875, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 522.1875, "epoch": 381.8, "grad_norm": 2.0373989754015125, "kl": 0.5072021484375, "learning_rate": 1.231165940486234e-08, "loss": 0.034, "reward": 12.49843743443489, "reward_std": 6.0733470767736435, "rewards/accuracy_reward_staging": 1.1014062575995922, "rewards/format_reward": 0.734375, "rewards/format_reward_staging": 0.75, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 498.09375, "epoch": 382.8, "grad_norm": 2.0529799885966997, "kl": 0.5218505859375, "learning_rate": 1.0984136638083175e-08, "loss": 0.0674, "reward": 12.098437368869781, "reward_std": 7.62692953646183, "rewards/accuracy_reward_staging": 1.0817187586799264, "rewards/format_reward": 0.640625, "rewards/format_reward_staging": 0.640625, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 523.203125, "epoch": 383.8, "grad_norm": 2.0301601145066495, "kl": 0.521240234375, "learning_rate": 9.731931258429638e-09, "loss": 0.059, "reward": 10.729687452316284, "reward_std": 5.984116218984127, "rewards/accuracy_reward_staging": 0.9464062480255961, "rewards/format_reward": 0.671875, "rewards/format_reward_staging": 0.59375, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 502.109375, "epoch": 384.8, "grad_norm": 4.517612366443085, "kl": 0.602294921875, "learning_rate": 8.555138626189618e-09, "loss": 0.0919, "reward": 10.614062532782555, "reward_std": 7.197298094630241, "rewards/accuracy_reward_staging": 0.9317187555134296, "rewards/format_reward": 0.65625, "rewards/format_reward_staging": 0.640625, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 531.5625, "epoch": 385.8, "grad_norm": 1.996721078816804, "kl": 0.542236328125, "learning_rate": 7.453848358678017e-09, "loss": 0.0893, "reward": 12.423437505960464, "reward_std": 7.174239456653595, "rewards/accuracy_reward_staging": 1.0970312654972076, "rewards/format_reward": 0.765625, "rewards/format_reward_staging": 0.6875, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 529.5, "epoch": 386.8, "grad_norm": 1.6692450152429934, "kl": 0.4757080078125, "learning_rate": 6.4281443234125434e-09, "loss": 0.0622, "reward": 10.457812458276749, "reward_std": 8.408779114484787, "rewards/accuracy_reward_staging": 0.9207812584936619, "rewards/format_reward": 0.6875, "rewards/format_reward_staging": 0.5625, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 516.28125, "epoch": 387.8, "grad_norm": 72.65003366147363, "kl": 1.12158203125, "learning_rate": 5.47810463172671e-09, "loss": 0.0677, "reward": 14.623437404632568, "reward_std": 7.604200206696987, "rewards/accuracy_reward_staging": 1.3154687583446503, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.71875, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 540.546875, "epoch": 388.8, "grad_norm": 2.13782688418701, "kl": 0.5604248046875, "learning_rate": 4.603801632821147e-09, "loss": 0.0741, "reward": 11.571875154972076, "reward_std": 8.498483955860138, "rewards/accuracy_reward_staging": 1.0556250140070915, "rewards/format_reward": 0.515625, "rewards/format_reward_staging": 0.5, "step": 389 }, { "epoch": 389.8, "grad_norm": 2.550655354845835, "learning_rate": 3.805301908254455e-09, "loss": 0.0785, "step": 390 }, { "epoch": 389.8, "eval_clip_ratio": 0.0, "eval_completion_length": 543.7, "eval_kl": 0.77880859375, "eval_loss": 0.05070864409208298, "eval_reward": 11.214999914169312, "eval_reward_std": 7.508172661066055, "eval_rewards/accuracy_reward_staging": 1.0102500110864638, "eval_rewards/format_reward": 0.575, "eval_rewards/format_reward_staging": 0.5375, "eval_runtime": 147.1522, "eval_samples_per_second": 0.136, "eval_steps_per_second": 0.034, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 514.140625, "epoch": 390.8, "grad_norm": 2.7881900199384537, "kl": 0.58929443359375, "learning_rate": 3.082666266872036e-09, "loss": 0.0336, "reward": 12.8234374076128, "reward_std": 7.5515576638281345, "rewards/accuracy_reward_staging": 1.1495312573388219, "rewards/format_reward": 0.6640625, "rewards/format_reward_staging": 0.6640625, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 524.25, "epoch": 391.8, "grad_norm": 2.166883481802826, "kl": 0.585693359375, "learning_rate": 2.435949740175802e-09, "loss": 0.0693, "reward": 9.915624856948853, "reward_std": 7.9057832062244415, "rewards/accuracy_reward_staging": 0.8790625054389238, "rewards/format_reward": 0.578125, "rewards/format_reward_staging": 0.546875, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 558.03125, "epoch": 392.8, "grad_norm": 2.013874424977512, "kl": 0.5469970703125, "learning_rate": 1.86520157813308e-09, "loss": -0.0047, "reward": 10.384374856948853, "reward_std": 7.076003402471542, "rewards/accuracy_reward_staging": 0.9103124821558595, "rewards/format_reward": 0.65625, "rewards/format_reward_staging": 0.625, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 573.859375, "epoch": 393.8, "grad_norm": 4.782888144712166, "kl": 0.7337646484375, "learning_rate": 1.3704652454261667e-09, "loss": 0.0785, "reward": 10.860937401652336, "reward_std": 8.005559802055359, "rewards/accuracy_reward_staging": 0.9782812423072755, "rewards/format_reward": 0.546875, "rewards/format_reward_staging": 0.53125, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 509.453125, "epoch": 394.8, "grad_norm": 5.775714842908385, "kl": 0.88623046875, "learning_rate": 9.517784181422018e-10, "loss": 0.0558, "reward": 10.265624895691872, "reward_std": 7.404541149735451, "rewards/accuracy_reward_staging": 0.9109375043772161, "rewards/format_reward": 0.609375, "rewards/format_reward_staging": 0.546875, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 543.453125, "epoch": 395.8, "grad_norm": 2.0434917013326266, "kl": 0.5186767578125, "learning_rate": 6.091729809042379e-10, "loss": 0.0542, "reward": 11.81874991953373, "reward_std": 9.100275874137878, "rewards/accuracy_reward_staging": 1.0662500150501728, "rewards/format_reward": 0.609375, "rewards/format_reward_staging": 0.546875, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 546.1875, "epoch": 396.8, "grad_norm": 1.7418802984126296, "kl": 0.4542236328125, "learning_rate": 3.426750244427401e-10, "loss": 0.07, "reward": 12.928124815225601, "reward_std": 7.5549889877438545, "rewards/accuracy_reward_staging": 1.1490625096485019, "rewards/format_reward": 0.71875, "rewards/format_reward_staging": 0.71875, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 492.109375, "epoch": 397.8, "grad_norm": 3.724982054081198, "kl": 0.7119140625, "learning_rate": 1.5230484360873042e-10, "loss": 0.0322, "reward": 12.485937371850014, "reward_std": 7.725762560963631, "rewards/accuracy_reward_staging": 1.1079687606543303, "rewards/format_reward": 0.734375, "rewards/format_reward_staging": 0.671875, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 550.203125, "epoch": 398.8, "grad_norm": 7.1968629953818315, "kl": 0.607421875, "learning_rate": 3.8076935828690315e-11, "loss": 0.1134, "reward": 9.825000047683716, "reward_std": 8.716731041669846, "rewards/accuracy_reward_staging": 0.8637500181794167, "rewards/format_reward": 0.640625, "rewards/format_reward_staging": 0.546875, "step": 399 }, { "epoch": 399.8, "grad_norm": 2.578825219229897, "learning_rate": 0.0, "loss": 0.0625, "step": 400 }, { "epoch": 399.8, "eval_clip_ratio": 0.0, "eval_completion_length": 564.25, "eval_kl": 0.541796875, "eval_loss": 0.1237805113196373, "eval_reward": 12.454999792575837, "eval_reward_std": 7.5952778339385985, "eval_rewards/accuracy_reward_staging": 1.1054999977350235, "eval_rewards/format_reward": 0.7125, "eval_rewards/format_reward_staging": 0.6875, "eval_runtime": 144.5952, "eval_samples_per_second": 0.138, "eval_steps_per_second": 0.035, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 498.03125, "epoch": 399.8, "kl": 0.4832763671875, "reward": 14.412499785423279, "reward_std": 7.106586746871471, "rewards/accuracy_reward_staging": 1.294375006109476, "rewards/format_reward": 0.734375, "rewards/format_reward_staging": 0.734375, "step": 400, "total_flos": 0.0, "train_loss": 1.0999555667603271, "train_runtime": 38247.6092, "train_samples_per_second": 0.209, "train_steps_per_second": 0.01 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 400, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }