| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 399.8, | |
| "eval_steps": 10, | |
| "global_step": 400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 606.59375, | |
| "epoch": 0.8, | |
| "grad_norm": 1.3200632494048739, | |
| "kl": 0.0, | |
| "learning_rate": 5e-08, | |
| "loss": 0.043, | |
| "reward": 11.62500011920929, | |
| "reward_std": 5.327881373465061, | |
| "rewards/accuracy_reward_staging": 0.9671875108033419, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 622.75, | |
| "epoch": 1.8, | |
| "grad_norm": 1.245967192961327, | |
| "kl": 0.0, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0103, | |
| "reward": 11.717187702655792, | |
| "reward_std": 5.550888277590275, | |
| "rewards/accuracy_reward_staging": 0.973281254991889, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 562.15625, | |
| "epoch": 2.8, | |
| "grad_norm": 1.3598583264581219, | |
| "kl": 0.0012388229370117188, | |
| "learning_rate": 1.5e-07, | |
| "loss": 0.0066, | |
| "reward": 11.326562702655792, | |
| "reward_std": 4.338181830942631, | |
| "rewards/accuracy_reward_staging": 0.935781279578805, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 633.84375, | |
| "epoch": 3.8, | |
| "grad_norm": 1.2847021449800877, | |
| "kl": 0.001153707504272461, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0018, | |
| "reward": 11.992187529802322, | |
| "reward_std": 5.119553402066231, | |
| "rewards/accuracy_reward_staging": 1.0039062658324838, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 617.796875, | |
| "epoch": 4.8, | |
| "grad_norm": 1.3189665452513488, | |
| "kl": 0.0013086795806884766, | |
| "learning_rate": 2.5e-07, | |
| "loss": 0.085, | |
| "reward": 10.701562702655792, | |
| "reward_std": 5.677764259278774, | |
| "rewards/accuracy_reward_staging": 0.8795312605798244, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 610.96875, | |
| "epoch": 5.8, | |
| "grad_norm": 1.2985032171428867, | |
| "kl": 0.001140594482421875, | |
| "learning_rate": 3e-07, | |
| "loss": -0.0194, | |
| "reward": 11.771875321865082, | |
| "reward_std": 4.783194027841091, | |
| "rewards/accuracy_reward_staging": 0.9771875143051147, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 601.046875, | |
| "epoch": 6.8, | |
| "grad_norm": 1.3341628850896736, | |
| "kl": 0.0011951923370361328, | |
| "learning_rate": 3.5e-07, | |
| "loss": -0.0596, | |
| "reward": 10.256250113248825, | |
| "reward_std": 4.867078542709351, | |
| "rewards/accuracy_reward_staging": 0.827187517657876, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 606.59375, | |
| "epoch": 7.8, | |
| "grad_norm": 1.285368092332121, | |
| "kl": 0.0010988712310791016, | |
| "learning_rate": 4e-07, | |
| "loss": -0.0081, | |
| "reward": 9.478125274181366, | |
| "reward_std": 3.7967969875317067, | |
| "rewards/accuracy_reward_staging": 0.7493750108405948, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 622.796875, | |
| "epoch": 8.8, | |
| "grad_norm": 1.2619218024838643, | |
| "kl": 0.0011816024780273438, | |
| "learning_rate": 4.5e-07, | |
| "loss": 0.022, | |
| "reward": 9.204687714576721, | |
| "reward_std": 4.6997692584991455, | |
| "rewards/accuracy_reward_staging": 0.7235937705263495, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 9.8, | |
| "grad_norm": 1.3090135878224105, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0552, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 9.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 598.6, | |
| "eval_kl": 0.001386260986328125, | |
| "eval_loss": -0.007845744490623474, | |
| "eval_reward": 10.647500252723693, | |
| "eval_reward_std": 5.141342180967331, | |
| "eval_rewards/accuracy_reward_staging": 0.8647500067949295, | |
| "eval_rewards/format_reward": 1.0, | |
| "eval_rewards/format_reward_staging": 1.0, | |
| "eval_runtime": 128.7421, | |
| "eval_samples_per_second": 0.155, | |
| "eval_steps_per_second": 0.039, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 616.1640625, | |
| "epoch": 10.8, | |
| "grad_norm": 1.2259380562812106, | |
| "kl": 0.0011686086654663086, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.021, | |
| "reward": 10.721875354647636, | |
| "reward_std": 4.659499041736126, | |
| "rewards/accuracy_reward_staging": 0.8792187599465251, | |
| "rewards/format_reward": 0.9609375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 614.578125, | |
| "epoch": 11.8, | |
| "grad_norm": 1.1553077299419614, | |
| "kl": 0.0011510848999023438, | |
| "learning_rate": 6e-07, | |
| "loss": -0.025, | |
| "reward": 10.503125250339508, | |
| "reward_std": 5.1386475414037704, | |
| "rewards/accuracy_reward_staging": 0.8581250142306089, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 612.90625, | |
| "epoch": 12.8, | |
| "grad_norm": 1.3576949395163465, | |
| "kl": 0.0011174678802490234, | |
| "learning_rate": 6.5e-07, | |
| "loss": -0.0006, | |
| "reward": 10.643750101327896, | |
| "reward_std": 4.954892493784428, | |
| "rewards/accuracy_reward_staging": 0.8737500086426735, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 628.53125, | |
| "epoch": 13.8, | |
| "grad_norm": 1.2887380340653378, | |
| "kl": 0.0011186599731445312, | |
| "learning_rate": 7e-07, | |
| "loss": -0.0706, | |
| "reward": 8.992187768220901, | |
| "reward_std": 4.08132154494524, | |
| "rewards/accuracy_reward_staging": 0.7023437591269612, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 655.21875, | |
| "epoch": 14.8, | |
| "grad_norm": 1.3384872721535677, | |
| "kl": 0.0011413097381591797, | |
| "learning_rate": 7.5e-07, | |
| "loss": -0.0176, | |
| "reward": 10.270312935113907, | |
| "reward_std": 5.108375668525696, | |
| "rewards/accuracy_reward_staging": 0.837968748062849, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 620.21875, | |
| "epoch": 15.8, | |
| "grad_norm": 1.3078665462402237, | |
| "kl": 0.0012836456298828125, | |
| "learning_rate": 8e-07, | |
| "loss": -0.0292, | |
| "reward": 10.915625095367432, | |
| "reward_std": 5.460881970822811, | |
| "rewards/accuracy_reward_staging": 0.8946875166147947, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 619.3125, | |
| "epoch": 16.8, | |
| "grad_norm": 1.3598193439821062, | |
| "kl": 0.0014238357543945312, | |
| "learning_rate": 8.499999999999999e-07, | |
| "loss": 0.008, | |
| "reward": 10.659375220537186, | |
| "reward_std": 5.2481329292058945, | |
| "rewards/accuracy_reward_staging": 0.8690625205636024, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 649.84375, | |
| "epoch": 17.8, | |
| "grad_norm": 1.3496406035053183, | |
| "kl": 0.0015878677368164062, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0445, | |
| "reward": 10.225000023841858, | |
| "reward_std": 4.735325090587139, | |
| "rewards/accuracy_reward_staging": 0.8225000277161598, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 572.265625, | |
| "epoch": 18.8, | |
| "grad_norm": 1.3727510323236323, | |
| "kl": 0.0017957687377929688, | |
| "learning_rate": 9.499999999999999e-07, | |
| "loss": 0.0497, | |
| "reward": 9.406250178813934, | |
| "reward_std": 4.422982223331928, | |
| "rewards/accuracy_reward_staging": 0.7437500189989805, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 19.8, | |
| "grad_norm": 1.2508053733094389, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0193, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 19.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 644.85, | |
| "eval_kl": 0.0017383575439453125, | |
| "eval_loss": 0.013418617658317089, | |
| "eval_reward": 12.118750143051148, | |
| "eval_reward_std": 5.42993243932724, | |
| "eval_rewards/accuracy_reward_staging": 1.0181250065565108, | |
| "eval_rewards/format_reward": 0.9375, | |
| "eval_rewards/format_reward_staging": 1.0, | |
| "eval_runtime": 138.2951, | |
| "eval_samples_per_second": 0.145, | |
| "eval_steps_per_second": 0.036, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 618.1875, | |
| "epoch": 20.8, | |
| "grad_norm": 1.2769652330197667, | |
| "kl": 0.0017538070678710938, | |
| "learning_rate": 1.05e-06, | |
| "loss": 0.0169, | |
| "reward": 10.289843887090683, | |
| "reward_std": 4.1728136613965034, | |
| "rewards/accuracy_reward_staging": 0.8297656457871199, | |
| "rewards/format_reward": 0.9921875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 586.828125, | |
| "epoch": 21.8, | |
| "grad_norm": 1.2550140515179407, | |
| "kl": 0.0023107528686523438, | |
| "learning_rate": 1.1e-06, | |
| "loss": 0.0203, | |
| "reward": 10.453125178813934, | |
| "reward_std": 5.255412273108959, | |
| "rewards/accuracy_reward_staging": 0.8484374992549419, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 609.78125, | |
| "epoch": 22.8, | |
| "grad_norm": 1.1649225522797726, | |
| "kl": 0.0023360252380371094, | |
| "learning_rate": 1.1499999999999998e-06, | |
| "loss": 0.0301, | |
| "reward": 10.517187863588333, | |
| "reward_std": 4.380151428282261, | |
| "rewards/accuracy_reward_staging": 0.8532812558114529, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 640.1875, | |
| "epoch": 23.8, | |
| "grad_norm": 1.2148463960481974, | |
| "kl": 0.0026197433471679688, | |
| "learning_rate": 1.2e-06, | |
| "loss": 0.0016, | |
| "reward": 10.815625190734863, | |
| "reward_std": 4.644554391503334, | |
| "rewards/accuracy_reward_staging": 0.8893750132992864, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 605.65625, | |
| "epoch": 24.8, | |
| "grad_norm": 1.2403401257649775, | |
| "kl": 0.0026373863220214844, | |
| "learning_rate": 1.2499999999999999e-06, | |
| "loss": 0.0521, | |
| "reward": 10.245312750339508, | |
| "reward_std": 4.28605642169714, | |
| "rewards/accuracy_reward_staging": 0.8323437552899122, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 598.609375, | |
| "epoch": 25.8, | |
| "grad_norm": 1.3147177916400645, | |
| "kl": 0.0031595230102539062, | |
| "learning_rate": 1.3e-06, | |
| "loss": 0.0414, | |
| "reward": 10.621875256299973, | |
| "reward_std": 5.236618235707283, | |
| "rewards/accuracy_reward_staging": 0.8684375081211329, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 625.40625, | |
| "epoch": 26.8, | |
| "grad_norm": 1.2154796498346485, | |
| "kl": 0.003124237060546875, | |
| "learning_rate": 1.35e-06, | |
| "loss": -0.0316, | |
| "reward": 11.971875131130219, | |
| "reward_std": 4.97715250402689, | |
| "rewards/accuracy_reward_staging": 1.0034375116229057, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 669.484375, | |
| "epoch": 27.8, | |
| "grad_norm": 1.2295951327912775, | |
| "kl": 0.004219532012939453, | |
| "learning_rate": 1.4e-06, | |
| "loss": 0.0328, | |
| "reward": 10.487500160932541, | |
| "reward_std": 4.095494709908962, | |
| "rewards/accuracy_reward_staging": 0.8581250123679638, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 642.875, | |
| "epoch": 28.8, | |
| "grad_norm": 1.322550495751781, | |
| "kl": 0.00588226318359375, | |
| "learning_rate": 1.4499999999999999e-06, | |
| "loss": 0.06, | |
| "reward": 10.221875250339508, | |
| "reward_std": 4.399149507284164, | |
| "rewards/accuracy_reward_staging": 0.8268750105053186, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 29.8, | |
| "grad_norm": 1.2977104318293018, | |
| "learning_rate": 1.5e-06, | |
| "loss": 0.0428, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 29.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 615.9875, | |
| "eval_kl": 0.005646514892578125, | |
| "eval_loss": -0.0016943871742114425, | |
| "eval_reward": 10.84500024318695, | |
| "eval_reward_std": 4.436952286958695, | |
| "eval_rewards/accuracy_reward_staging": 0.8932500079274177, | |
| "eval_rewards/format_reward": 0.925, | |
| "eval_rewards/format_reward_staging": 0.9875, | |
| "eval_runtime": 140.4185, | |
| "eval_samples_per_second": 0.142, | |
| "eval_steps_per_second": 0.036, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 602.734375, | |
| "epoch": 30.8, | |
| "grad_norm": 1.2642898418324884, | |
| "kl": 0.006984233856201172, | |
| "learning_rate": 1.55e-06, | |
| "loss": -0.0342, | |
| "reward": 10.63593776524067, | |
| "reward_std": 4.29075089469552, | |
| "rewards/accuracy_reward_staging": 0.865937520749867, | |
| "rewards/format_reward": 0.9765625, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 562.6875, | |
| "epoch": 31.8, | |
| "grad_norm": 1.3685850711260872, | |
| "kl": 0.0068912506103515625, | |
| "learning_rate": 1.6e-06, | |
| "loss": 0.0138, | |
| "reward": 10.934375166893005, | |
| "reward_std": 5.272631503641605, | |
| "rewards/accuracy_reward_staging": 0.8996875174343586, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 612.984375, | |
| "epoch": 32.8, | |
| "grad_norm": 1.2202427650766963, | |
| "kl": 0.00725555419921875, | |
| "learning_rate": 1.6499999999999999e-06, | |
| "loss": 0.0152, | |
| "reward": 11.089062750339508, | |
| "reward_std": 5.698997817933559, | |
| "rewards/accuracy_reward_staging": 0.9151562694460154, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 637.859375, | |
| "epoch": 33.8, | |
| "grad_norm": 1.238938352076857, | |
| "kl": 0.00881195068359375, | |
| "learning_rate": 1.6999999999999998e-06, | |
| "loss": -0.0175, | |
| "reward": 10.721875220537186, | |
| "reward_std": 4.769842825829983, | |
| "rewards/accuracy_reward_staging": 0.8784375097602606, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 659.125, | |
| "epoch": 34.8, | |
| "grad_norm": 1.2817921497759655, | |
| "kl": 0.009099960327148438, | |
| "learning_rate": 1.75e-06, | |
| "loss": -0.0368, | |
| "reward": 11.725000083446503, | |
| "reward_std": 5.023090958595276, | |
| "rewards/accuracy_reward_staging": 0.9881250187754631, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 627.671875, | |
| "epoch": 35.8, | |
| "grad_norm": 1.2427123786378356, | |
| "kl": 0.00952911376953125, | |
| "learning_rate": 1.8e-06, | |
| "loss": 0.0311, | |
| "reward": 10.635937720537186, | |
| "reward_std": 4.218031510710716, | |
| "rewards/accuracy_reward_staging": 0.8682812862098217, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 606.109375, | |
| "epoch": 36.8, | |
| "grad_norm": 1.2742370115467172, | |
| "kl": 0.012310028076171875, | |
| "learning_rate": 1.85e-06, | |
| "loss": 0.0282, | |
| "reward": 11.673437774181366, | |
| "reward_std": 3.9488272815942764, | |
| "rewards/accuracy_reward_staging": 0.9720312729477882, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 651.328125, | |
| "epoch": 37.8, | |
| "grad_norm": 1.1726321374661877, | |
| "kl": 0.012788772583007812, | |
| "learning_rate": 1.8999999999999998e-06, | |
| "loss": 0.0163, | |
| "reward": 10.459375202655792, | |
| "reward_std": 4.296897903084755, | |
| "rewards/accuracy_reward_staging": 0.852187518030405, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 617.984375, | |
| "epoch": 38.8, | |
| "grad_norm": 1.2806374425181677, | |
| "kl": 0.017009735107421875, | |
| "learning_rate": 1.95e-06, | |
| "loss": -0.0057, | |
| "reward": 10.478125214576721, | |
| "reward_std": 4.7161330208182335, | |
| "rewards/accuracy_reward_staging": 0.8525000084191561, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 39.8, | |
| "grad_norm": 1.362092657080068, | |
| "learning_rate": 2e-06, | |
| "loss": -0.0269, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 39.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 603.9875, | |
| "eval_kl": 0.0187530517578125, | |
| "eval_loss": 0.0036536618135869503, | |
| "eval_reward": 11.611250162124634, | |
| "eval_reward_std": 5.22377119064331, | |
| "eval_rewards/accuracy_reward_staging": 0.9636250138282776, | |
| "eval_rewards/format_reward": 0.975, | |
| "eval_rewards/format_reward_staging": 1.0, | |
| "eval_runtime": 132.1448, | |
| "eval_samples_per_second": 0.151, | |
| "eval_steps_per_second": 0.038, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 634.0, | |
| "epoch": 40.8, | |
| "grad_norm": 1.2640957862139377, | |
| "kl": 0.018407821655273438, | |
| "learning_rate": 1.999961923064171e-06, | |
| "loss": -0.0634, | |
| "reward": 11.232812687754631, | |
| "reward_std": 5.111758019775152, | |
| "rewards/accuracy_reward_staging": 0.9334375113248825, | |
| "rewards/format_reward": 0.9296875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 635.765625, | |
| "epoch": 41.8, | |
| "grad_norm": 1.268606166424927, | |
| "kl": 0.01999664306640625, | |
| "learning_rate": 1.9998476951563913e-06, | |
| "loss": 0.0283, | |
| "reward": 12.45000010728836, | |
| "reward_std": 4.9740989953279495, | |
| "rewards/accuracy_reward_staging": 1.0450000185519457, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 594.1875, | |
| "epoch": 42.8, | |
| "grad_norm": 1.3740022650965131, | |
| "kl": 0.020366668701171875, | |
| "learning_rate": 1.999657324975557e-06, | |
| "loss": -0.0149, | |
| "reward": 11.234375149011612, | |
| "reward_std": 5.008681446313858, | |
| "rewards/accuracy_reward_staging": 0.9250000100582838, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 622.640625, | |
| "epoch": 43.8, | |
| "grad_norm": 1.2026028918578335, | |
| "kl": 0.02169036865234375, | |
| "learning_rate": 1.9993908270190957e-06, | |
| "loss": 0.0018, | |
| "reward": 11.873437762260437, | |
| "reward_std": 4.005194254219532, | |
| "rewards/accuracy_reward_staging": 0.9920312594622374, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 637.0625, | |
| "epoch": 44.8, | |
| "grad_norm": 1.281104821624419, | |
| "kl": 0.022716522216796875, | |
| "learning_rate": 1.999048221581858e-06, | |
| "loss": 0.0455, | |
| "reward": 11.17031267285347, | |
| "reward_std": 4.456828519701958, | |
| "rewards/accuracy_reward_staging": 0.9201562497764826, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 652.5, | |
| "epoch": 45.8, | |
| "grad_norm": 1.3614150864044823, | |
| "kl": 0.0223236083984375, | |
| "learning_rate": 1.998629534754574e-06, | |
| "loss": 0.0205, | |
| "reward": 10.348437696695328, | |
| "reward_std": 4.60803659260273, | |
| "rewards/accuracy_reward_staging": 0.8426562454551458, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 626.359375, | |
| "epoch": 46.8, | |
| "grad_norm": 1.245324461252277, | |
| "kl": 0.0242156982421875, | |
| "learning_rate": 1.9981347984218667e-06, | |
| "loss": 0.0056, | |
| "reward": 13.950000077486038, | |
| "reward_std": 5.2063538283109665, | |
| "rewards/accuracy_reward_staging": 1.2059375159442425, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 593.96875, | |
| "epoch": 47.8, | |
| "grad_norm": 1.3313171277926144, | |
| "kl": 0.02922821044921875, | |
| "learning_rate": 1.997564050259824e-06, | |
| "loss": 0.0449, | |
| "reward": 12.739062905311584, | |
| "reward_std": 4.18791925907135, | |
| "rewards/accuracy_reward_staging": 1.075468771159649, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 580.21875, | |
| "epoch": 48.8, | |
| "grad_norm": 1.3629349178288628, | |
| "kl": 0.03372955322265625, | |
| "learning_rate": 1.996917333733128e-06, | |
| "loss": 0.0174, | |
| "reward": 11.535937696695328, | |
| "reward_std": 3.948319137096405, | |
| "rewards/accuracy_reward_staging": 0.9614062653854489, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 49.8, | |
| "grad_norm": 1.210177545645759, | |
| "learning_rate": 1.9961946980917456e-06, | |
| "loss": 0.0148, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 49.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 681.0125, | |
| "eval_kl": 0.0308837890625, | |
| "eval_loss": 0.084346242249012, | |
| "eval_reward": 12.700000238418578, | |
| "eval_reward_std": 4.6470307350158695, | |
| "eval_rewards/accuracy_reward_staging": 1.0800000175833702, | |
| "eval_rewards/format_reward": 0.9375, | |
| "eval_rewards/format_reward_staging": 0.9625, | |
| "eval_runtime": 184.5613, | |
| "eval_samples_per_second": 0.108, | |
| "eval_steps_per_second": 0.027, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 635.09375, | |
| "epoch": 50.8, | |
| "grad_norm": 1.3066463171281288, | |
| "kl": 0.033504486083984375, | |
| "learning_rate": 1.9953961983671786e-06, | |
| "loss": 0.026, | |
| "reward": 11.925000175833702, | |
| "reward_std": 5.033060222864151, | |
| "rewards/accuracy_reward_staging": 0.9956250190734863, | |
| "rewards/format_reward": 0.9765625, | |
| "rewards/format_reward_staging": 0.9921875, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 633.59375, | |
| "epoch": 51.8, | |
| "grad_norm": 1.4900533150827393, | |
| "kl": 0.040374755859375, | |
| "learning_rate": 1.994521895368273e-06, | |
| "loss": 0.0244, | |
| "reward": 11.634375095367432, | |
| "reward_std": 4.955964259803295, | |
| "rewards/accuracy_reward_staging": 0.9665625263005495, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 642.84375, | |
| "epoch": 52.8, | |
| "grad_norm": 1.3018535470423898, | |
| "kl": 0.0359039306640625, | |
| "learning_rate": 1.9935718556765874e-06, | |
| "loss": 0.0176, | |
| "reward": 13.220312714576721, | |
| "reward_std": 6.300683185458183, | |
| "rewards/accuracy_reward_staging": 1.1282812524586916, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 625.59375, | |
| "epoch": 53.8, | |
| "grad_norm": 1.1747250249399175, | |
| "kl": 0.0344696044921875, | |
| "learning_rate": 1.992546151641322e-06, | |
| "loss": 0.0279, | |
| "reward": 12.729687660932541, | |
| "reward_std": 3.8526118397712708, | |
| "rewards/accuracy_reward_staging": 1.0854687504470348, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 616.421875, | |
| "epoch": 54.8, | |
| "grad_norm": 1.4193243535352469, | |
| "kl": 0.0395355224609375, | |
| "learning_rate": 1.9914448613738106e-06, | |
| "loss": 0.0064, | |
| "reward": 13.129687666893005, | |
| "reward_std": 5.78121767193079, | |
| "rewards/accuracy_reward_staging": 1.1223437692970037, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 593.609375, | |
| "epoch": 55.8, | |
| "grad_norm": 1.3523573855480968, | |
| "kl": 0.04061126708984375, | |
| "learning_rate": 1.99026806874157e-06, | |
| "loss": 0.0142, | |
| "reward": 13.071875303983688, | |
| "reward_std": 6.487003266811371, | |
| "rewards/accuracy_reward_staging": 1.1118750274181366, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 606.71875, | |
| "epoch": 56.8, | |
| "grad_norm": 1.295861523746061, | |
| "kl": 0.04427337646484375, | |
| "learning_rate": 1.989015863361917e-06, | |
| "loss": 0.0139, | |
| "reward": 13.359375238418579, | |
| "reward_std": 6.011465005576611, | |
| "rewards/accuracy_reward_staging": 1.1375000104308128, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 632.140625, | |
| "epoch": 57.8, | |
| "grad_norm": 1.358415139567083, | |
| "kl": 0.0454254150390625, | |
| "learning_rate": 1.9876883405951377e-06, | |
| "loss": 0.0217, | |
| "reward": 12.626562535762787, | |
| "reward_std": 4.465259864926338, | |
| "rewards/accuracy_reward_staging": 1.0657812729477882, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 598.5, | |
| "epoch": 58.8, | |
| "grad_norm": 1.3223218232049598, | |
| "kl": 0.046783447265625, | |
| "learning_rate": 1.986285601537231e-06, | |
| "loss": 0.0257, | |
| "reward": 12.182812690734863, | |
| "reward_std": 6.196883611381054, | |
| "rewards/accuracy_reward_staging": 1.0292187482118607, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 59.8, | |
| "grad_norm": 1.2192909123573732, | |
| "learning_rate": 1.984807753012208e-06, | |
| "loss": 0.0537, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 59.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 641.15, | |
| "eval_kl": 0.04580078125, | |
| "eval_loss": -0.05469979718327522, | |
| "eval_reward": 13.055000233650208, | |
| "eval_reward_std": 5.354214292764664, | |
| "eval_rewards/accuracy_reward_staging": 1.1130000218749045, | |
| "eval_rewards/format_reward": 0.9625, | |
| "eval_rewards/format_reward_staging": 0.9625, | |
| "eval_runtime": 152.2218, | |
| "eval_samples_per_second": 0.131, | |
| "eval_steps_per_second": 0.033, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 637.5546875, | |
| "epoch": 60.8, | |
| "grad_norm": 1.2273226448891168, | |
| "kl": 0.046173095703125, | |
| "learning_rate": 1.9832549075639547e-06, | |
| "loss": -0.0281, | |
| "reward": 12.067969009280205, | |
| "reward_std": 5.051002878695726, | |
| "rewards/accuracy_reward_staging": 1.0114843952469528, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 626.640625, | |
| "epoch": 61.8, | |
| "grad_norm": 1.2972851245460888, | |
| "kl": 0.062713623046875, | |
| "learning_rate": 1.981627183447664e-06, | |
| "loss": 0.0389, | |
| "reward": 12.120312690734863, | |
| "reward_std": 3.9745979011058807, | |
| "rewards/accuracy_reward_staging": 1.0229687616229057, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 669.671875, | |
| "epoch": 62.8, | |
| "grad_norm": 1.433658602561295, | |
| "kl": 0.05224609375, | |
| "learning_rate": 1.9799247046208295e-06, | |
| "loss": 0.0548, | |
| "reward": 13.040625154972076, | |
| "reward_std": 5.295711062848568, | |
| "rewards/accuracy_reward_staging": 1.1118750162422657, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 618.8125, | |
| "epoch": 63.8, | |
| "grad_norm": 1.2699526702646982, | |
| "kl": 0.0532379150390625, | |
| "learning_rate": 1.9781476007338054e-06, | |
| "loss": 0.0405, | |
| "reward": 12.978125303983688, | |
| "reward_std": 5.858064912259579, | |
| "rewards/accuracy_reward_staging": 1.100937519222498, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 622.25, | |
| "epoch": 64.8, | |
| "grad_norm": 1.2887790245964135, | |
| "kl": 0.0642242431640625, | |
| "learning_rate": 1.976296007119933e-06, | |
| "loss": 0.0309, | |
| "reward": 13.806250274181366, | |
| "reward_std": 5.3699341379106045, | |
| "rewards/accuracy_reward_staging": 1.1900000236928463, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 626.75, | |
| "epoch": 65.8, | |
| "grad_norm": 1.2658566344825595, | |
| "kl": 0.0558319091796875, | |
| "learning_rate": 1.9743700647852355e-06, | |
| "loss": -0.0173, | |
| "reward": 12.885937601327896, | |
| "reward_std": 5.130606591701508, | |
| "rewards/accuracy_reward_staging": 1.0917187500745058, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 603.84375, | |
| "epoch": 66.8, | |
| "grad_norm": 1.2421251352393117, | |
| "kl": 0.0610198974609375, | |
| "learning_rate": 1.9723699203976766e-06, | |
| "loss": 0.0279, | |
| "reward": 12.806250214576721, | |
| "reward_std": 4.9495924392249435, | |
| "rewards/accuracy_reward_staging": 1.0806250050663948, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 610.90625, | |
| "epoch": 67.8, | |
| "grad_norm": 1.2743636487669978, | |
| "kl": 0.07061767578125, | |
| "learning_rate": 1.9702957262759963e-06, | |
| "loss": 0.0096, | |
| "reward": 12.381250366568565, | |
| "reward_std": 4.895804196596146, | |
| "rewards/accuracy_reward_staging": 1.0443749986588955, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 578.734375, | |
| "epoch": 68.8, | |
| "grad_norm": 19.774598485500963, | |
| "kl": 0.20587158203125, | |
| "learning_rate": 1.9681476403781077e-06, | |
| "loss": 0.0525, | |
| "reward": 13.853125363588333, | |
| "reward_std": 4.133783400058746, | |
| "rewards/accuracy_reward_staging": 1.1900000181049109, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 69.8, | |
| "grad_norm": 1.2630722978620754, | |
| "learning_rate": 1.965925826289068e-06, | |
| "loss": -0.05, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 69.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 644.375, | |
| "eval_kl": 0.06834716796875, | |
| "eval_loss": 0.05038486793637276, | |
| "eval_reward": 13.201250171661377, | |
| "eval_reward_std": 5.853598284721374, | |
| "eval_rewards/accuracy_reward_staging": 1.1376250192523003, | |
| "eval_rewards/format_reward": 0.8875, | |
| "eval_rewards/format_reward_staging": 0.9375, | |
| "eval_runtime": 152.6561, | |
| "eval_samples_per_second": 0.131, | |
| "eval_steps_per_second": 0.033, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 617.921875, | |
| "epoch": 70.8, | |
| "grad_norm": 1.3309524076426662, | |
| "kl": 0.0696563720703125, | |
| "learning_rate": 1.963630453208623e-06, | |
| "loss": 0.0613, | |
| "reward": 13.469531431794167, | |
| "reward_std": 5.184730686247349, | |
| "rewards/accuracy_reward_staging": 1.1508593847975135, | |
| "rewards/format_reward": 0.9765625, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 673.703125, | |
| "epoch": 71.8, | |
| "grad_norm": 1.3030149455750017, | |
| "kl": 0.0673980712890625, | |
| "learning_rate": 1.9612616959383188e-06, | |
| "loss": 0.0537, | |
| "reward": 14.193750262260437, | |
| "reward_std": 4.9487489387393, | |
| "rewards/accuracy_reward_staging": 1.2318750098347664, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 646.5625, | |
| "epoch": 72.8, | |
| "grad_norm": 1.2069285825783527, | |
| "kl": 0.0667724609375, | |
| "learning_rate": 1.958819734868193e-06, | |
| "loss": 0.0452, | |
| "reward": 13.829687654972076, | |
| "reward_std": 4.151405468583107, | |
| "rewards/accuracy_reward_staging": 1.1845312640070915, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 675.25, | |
| "epoch": 73.8, | |
| "grad_norm": 12.30456605431996, | |
| "kl": 0.144561767578125, | |
| "learning_rate": 1.9563047559630356e-06, | |
| "loss": 0.0238, | |
| "reward": 15.618750125169754, | |
| "reward_std": 5.085296101868153, | |
| "rewards/accuracy_reward_staging": 1.3665625024586916, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 627.640625, | |
| "epoch": 74.8, | |
| "grad_norm": 435.4454579019399, | |
| "kl": 2.6427154541015625, | |
| "learning_rate": 1.953716950748227e-06, | |
| "loss": 0.1019, | |
| "reward": 14.799999952316284, | |
| "reward_std": 4.404626630246639, | |
| "rewards/accuracy_reward_staging": 1.2831250056624413, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 591.453125, | |
| "epoch": 75.8, | |
| "grad_norm": 1.5728290641754585, | |
| "kl": 0.0894012451171875, | |
| "learning_rate": 1.9510565162951534e-06, | |
| "loss": 0.0154, | |
| "reward": 14.187500238418579, | |
| "reward_std": 4.889563232660294, | |
| "rewards/accuracy_reward_staging": 1.2265625279396772, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 618.296875, | |
| "epoch": 76.8, | |
| "grad_norm": 2.0802354331013824, | |
| "kl": 0.113311767578125, | |
| "learning_rate": 1.948323655206199e-06, | |
| "loss": 0.031, | |
| "reward": 14.854687571525574, | |
| "reward_std": 4.158232696354389, | |
| "rewards/accuracy_reward_staging": 1.2885937616229057, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 652.46875, | |
| "epoch": 77.8, | |
| "grad_norm": 1.388498981753556, | |
| "kl": 0.086456298828125, | |
| "learning_rate": 1.945518575599317e-06, | |
| "loss": 0.0197, | |
| "reward": 14.209375262260437, | |
| "reward_std": 5.606824688613415, | |
| "rewards/accuracy_reward_staging": 1.2318750135600567, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 604.703125, | |
| "epoch": 78.8, | |
| "grad_norm": 1.2332136505931421, | |
| "kl": 0.080596923828125, | |
| "learning_rate": 1.9426414910921785e-06, | |
| "loss": 0.0222, | |
| "reward": 14.440624922513962, | |
| "reward_std": 4.533382810652256, | |
| "rewards/accuracy_reward_staging": 1.247187502682209, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 79.8, | |
| "grad_norm": 1.2077461933733504, | |
| "learning_rate": 1.9396926207859082e-06, | |
| "loss": 0.0239, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 79.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 592.1125, | |
| "eval_kl": 0.077099609375, | |
| "eval_loss": 0.03515242785215378, | |
| "eval_reward": 14.468750166893006, | |
| "eval_reward_std": 4.639398086071014, | |
| "eval_rewards/accuracy_reward_staging": 1.2506250083446502, | |
| "eval_rewards/format_reward": 0.9875, | |
| "eval_rewards/format_reward_staging": 0.975, | |
| "eval_runtime": 133.8538, | |
| "eval_samples_per_second": 0.149, | |
| "eval_steps_per_second": 0.037, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 644.046875, | |
| "epoch": 80.8, | |
| "grad_norm": 1.673636310153387, | |
| "kl": 0.07987213134765625, | |
| "learning_rate": 1.9366721892483973e-06, | |
| "loss": 0.0333, | |
| "reward": 14.308594018220901, | |
| "reward_std": 3.8065029891440645, | |
| "rewards/accuracy_reward_staging": 1.233984388411045, | |
| "rewards/format_reward": 0.9765625, | |
| "rewards/format_reward_staging": 0.9921875, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 644.65625, | |
| "epoch": 81.8, | |
| "grad_norm": 1.2475722094071593, | |
| "kl": 0.07330322265625, | |
| "learning_rate": 1.9335804264972015e-06, | |
| "loss": -0.0326, | |
| "reward": 12.793750315904617, | |
| "reward_std": 4.888111189007759, | |
| "rewards/accuracy_reward_staging": 1.080937497317791, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 600.578125, | |
| "epoch": 82.8, | |
| "grad_norm": 1.1629304999796195, | |
| "kl": 0.0791168212890625, | |
| "learning_rate": 1.9304175679820247e-06, | |
| "loss": 0.0416, | |
| "reward": 12.628125369548798, | |
| "reward_std": 4.35176794230938, | |
| "rewards/accuracy_reward_staging": 1.0706250071525574, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 557.15625, | |
| "epoch": 83.8, | |
| "grad_norm": 1.439668786268354, | |
| "kl": 0.084381103515625, | |
| "learning_rate": 1.9271838545667875e-06, | |
| "loss": 0.0776, | |
| "reward": 12.189062774181366, | |
| "reward_std": 4.2925035655498505, | |
| "rewards/accuracy_reward_staging": 1.0235937517136335, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 594.0625, | |
| "epoch": 84.8, | |
| "grad_norm": 1.3254589188157202, | |
| "kl": 0.0747528076171875, | |
| "learning_rate": 1.9238795325112867e-06, | |
| "loss": 0.0619, | |
| "reward": 15.909375101327896, | |
| "reward_std": 5.054341539740562, | |
| "rewards/accuracy_reward_staging": 1.3925000187009573, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 595.28125, | |
| "epoch": 85.8, | |
| "grad_norm": 1.2110318535178632, | |
| "kl": 0.0708465576171875, | |
| "learning_rate": 1.9205048534524403e-06, | |
| "loss": 0.0277, | |
| "reward": 13.023437589406967, | |
| "reward_std": 4.805721327662468, | |
| "rewards/accuracy_reward_staging": 1.1070312801748514, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 613.15625, | |
| "epoch": 86.8, | |
| "grad_norm": 1.3901357478872436, | |
| "kl": 0.0800323486328125, | |
| "learning_rate": 1.917060074385124e-06, | |
| "loss": 0.0142, | |
| "reward": 14.523437321186066, | |
| "reward_std": 4.984271876513958, | |
| "rewards/accuracy_reward_staging": 1.257031261920929, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 603.078125, | |
| "epoch": 87.8, | |
| "grad_norm": 1.338201226896483, | |
| "kl": 0.0803375244140625, | |
| "learning_rate": 1.9135454576426007e-06, | |
| "loss": 0.0304, | |
| "reward": 14.829687654972076, | |
| "reward_std": 6.223069980740547, | |
| "rewards/accuracy_reward_staging": 1.2939062640070915, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 622.9375, | |
| "epoch": 88.8, | |
| "grad_norm": 2.521080873125213, | |
| "kl": 0.1446990966796875, | |
| "learning_rate": 1.909961270876543e-06, | |
| "loss": 0.0222, | |
| "reward": 14.043749928474426, | |
| "reward_std": 5.054637104272842, | |
| "rewards/accuracy_reward_staging": 1.2106250263750553, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 89.8, | |
| "grad_norm": 1.20377714499055, | |
| "learning_rate": 1.9063077870366499e-06, | |
| "loss": 0.048, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 89.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 648.2375, | |
| "eval_kl": 0.078076171875, | |
| "eval_loss": 0.07422037422657013, | |
| "eval_reward": 14.006250190734864, | |
| "eval_reward_std": 4.856718444824219, | |
| "eval_rewards/accuracy_reward_staging": 1.2081250160932542, | |
| "eval_rewards/format_reward": 0.95, | |
| "eval_rewards/format_reward_staging": 0.975, | |
| "eval_runtime": 145.5911, | |
| "eval_samples_per_second": 0.137, | |
| "eval_steps_per_second": 0.034, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 618.7265625, | |
| "epoch": 90.8, | |
| "grad_norm": 1.151785130187796, | |
| "kl": 0.0818328857421875, | |
| "learning_rate": 1.9025852843498606e-06, | |
| "loss": -0.0394, | |
| "reward": 14.92031255364418, | |
| "reward_std": 5.041832268238068, | |
| "rewards/accuracy_reward_staging": 1.2959375083446503, | |
| "rewards/format_reward": 0.9765625, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 650.859375, | |
| "epoch": 91.8, | |
| "grad_norm": 1.0808576276629485, | |
| "kl": 0.0752716064453125, | |
| "learning_rate": 1.8987940462991669e-06, | |
| "loss": 0.0142, | |
| "reward": 13.932812631130219, | |
| "reward_std": 5.106485404074192, | |
| "rewards/accuracy_reward_staging": 1.1948437727987766, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 653.96875, | |
| "epoch": 92.8, | |
| "grad_norm": 1.1761749428173782, | |
| "kl": 0.0748443603515625, | |
| "learning_rate": 1.894934361602025e-06, | |
| "loss": 0.061, | |
| "reward": 13.756249964237213, | |
| "reward_std": 4.668057285249233, | |
| "rewards/accuracy_reward_staging": 1.185000006109476, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 643.140625, | |
| "epoch": 93.8, | |
| "grad_norm": 1.1826779106192027, | |
| "kl": 0.08154296875, | |
| "learning_rate": 1.8910065241883678e-06, | |
| "loss": 0.0113, | |
| "reward": 15.871875286102295, | |
| "reward_std": 5.009915418922901, | |
| "rewards/accuracy_reward_staging": 1.3918750323355198, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 640.71875, | |
| "epoch": 94.8, | |
| "grad_norm": 1.220691618497292, | |
| "kl": 0.087249755859375, | |
| "learning_rate": 1.8870108331782216e-06, | |
| "loss": 0.0364, | |
| "reward": 15.275000274181366, | |
| "reward_std": 5.186372339725494, | |
| "rewards/accuracy_reward_staging": 1.3353125043213367, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 638.9375, | |
| "epoch": 95.8, | |
| "grad_norm": 1.262017317075815, | |
| "kl": 0.092254638671875, | |
| "learning_rate": 1.8829475928589268e-06, | |
| "loss": 0.0112, | |
| "reward": 11.41250017285347, | |
| "reward_std": 5.790772080421448, | |
| "rewards/accuracy_reward_staging": 0.9459375087171793, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 650.140625, | |
| "epoch": 96.8, | |
| "grad_norm": 1.555339606868479, | |
| "kl": 0.088897705078125, | |
| "learning_rate": 1.8788171126619653e-06, | |
| "loss": 0.0167, | |
| "reward": 13.132812529802322, | |
| "reward_std": 5.363104030489922, | |
| "rewards/accuracy_reward_staging": 1.1226562578231096, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 685.40625, | |
| "epoch": 97.8, | |
| "grad_norm": 1.1855081064145405, | |
| "kl": 0.0860595703125, | |
| "learning_rate": 1.8746197071393956e-06, | |
| "loss": -0.0101, | |
| "reward": 14.51250010728836, | |
| "reward_std": 5.773593910038471, | |
| "rewards/accuracy_reward_staging": 1.2575000114738941, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 632.890625, | |
| "epoch": 98.8, | |
| "grad_norm": 1.2154159531082918, | |
| "kl": 0.092254638671875, | |
| "learning_rate": 1.8703556959398995e-06, | |
| "loss": 0.0378, | |
| "reward": 13.392187654972076, | |
| "reward_std": 5.218963444232941, | |
| "rewards/accuracy_reward_staging": 1.1423437520861626, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 99.8, | |
| "grad_norm": 1.2652701809880356, | |
| "learning_rate": 1.8660254037844386e-06, | |
| "loss": 0.0158, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 99.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 596.325, | |
| "eval_kl": 0.170458984375, | |
| "eval_loss": 0.038617830723524094, | |
| "eval_reward": 13.322500276565552, | |
| "eval_reward_std": 4.48788731098175, | |
| "eval_rewards/accuracy_reward_staging": 1.1335000172257423, | |
| "eval_rewards/format_reward": 0.9875, | |
| "eval_rewards/format_reward_staging": 1.0, | |
| "eval_runtime": 137.9647, | |
| "eval_samples_per_second": 0.145, | |
| "eval_steps_per_second": 0.036, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 625.546875, | |
| "epoch": 100.8, | |
| "grad_norm": 1.158984032803143, | |
| "kl": 0.0985870361328125, | |
| "learning_rate": 1.8616291604415257e-06, | |
| "loss": 0.0013, | |
| "reward": 13.735937476158142, | |
| "reward_std": 5.272327609360218, | |
| "rewards/accuracy_reward_staging": 1.174375013448298, | |
| "rewards/format_reward": 0.9921875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 610.265625, | |
| "epoch": 101.8, | |
| "grad_norm": 1.1842728351019054, | |
| "kl": 0.095916748046875, | |
| "learning_rate": 1.8571673007021123e-06, | |
| "loss": 0.0156, | |
| "reward": 15.284374952316284, | |
| "reward_std": 4.7574154287576675, | |
| "rewards/accuracy_reward_staging": 1.330000001937151, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 640.171875, | |
| "epoch": 102.8, | |
| "grad_norm": 1.2460167784481468, | |
| "kl": 0.09686279296875, | |
| "learning_rate": 1.852640164354092e-06, | |
| "loss": -0.0181, | |
| "reward": 14.125000357627869, | |
| "reward_std": 4.396180346608162, | |
| "rewards/accuracy_reward_staging": 1.2203124929219484, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 632.03125, | |
| "epoch": 103.8, | |
| "grad_norm": 1.2093960835662856, | |
| "kl": 0.096099853515625, | |
| "learning_rate": 1.8480480961564257e-06, | |
| "loss": -0.0125, | |
| "reward": 15.537500262260437, | |
| "reward_std": 4.605620868504047, | |
| "rewards/accuracy_reward_staging": 1.3553125225007534, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 626.1875, | |
| "epoch": 104.8, | |
| "grad_norm": 16.01421157394276, | |
| "kl": 0.21142578125, | |
| "learning_rate": 1.8433914458128857e-06, | |
| "loss": 0.0579, | |
| "reward": 13.903125166893005, | |
| "reward_std": 6.153450347483158, | |
| "rewards/accuracy_reward_staging": 1.1950000002980232, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 646.25, | |
| "epoch": 105.8, | |
| "grad_norm": 1.3180024354800577, | |
| "kl": 0.10040283203125, | |
| "learning_rate": 1.838670567945424e-06, | |
| "loss": 0.068, | |
| "reward": 13.818750381469727, | |
| "reward_std": 5.729592114686966, | |
| "rewards/accuracy_reward_staging": 1.189687505364418, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 607.875, | |
| "epoch": 106.8, | |
| "grad_norm": 2.9653782850707398, | |
| "kl": 0.14129638671875, | |
| "learning_rate": 1.833885822067168e-06, | |
| "loss": 0.0536, | |
| "reward": 15.423437595367432, | |
| "reward_std": 6.023381091654301, | |
| "rewards/accuracy_reward_staging": 1.3454687595367432, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 644.546875, | |
| "epoch": 107.8, | |
| "grad_norm": 1.4820796355726387, | |
| "kl": 0.09906005859375, | |
| "learning_rate": 1.8290375725550415e-06, | |
| "loss": 0.097, | |
| "reward": 14.023437529802322, | |
| "reward_std": 6.225167877972126, | |
| "rewards/accuracy_reward_staging": 1.2054687719792128, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 585.25, | |
| "epoch": 108.8, | |
| "grad_norm": 1.653023129371335, | |
| "kl": 0.114166259765625, | |
| "learning_rate": 1.8241261886220154e-06, | |
| "loss": 0.0807, | |
| "reward": 14.356250017881393, | |
| "reward_std": 5.447244621813297, | |
| "rewards/accuracy_reward_staging": 1.2371875084936619, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 109.8, | |
| "grad_norm": 1.2519961806572524, | |
| "learning_rate": 1.8191520442889917e-06, | |
| "loss": 0.0487, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 109.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 607.9625, | |
| "eval_kl": 0.098583984375, | |
| "eval_loss": 0.015465144999325275, | |
| "eval_reward": 14.107500028610229, | |
| "eval_reward_std": 5.255042427778244, | |
| "eval_rewards/accuracy_reward_staging": 1.2157500088214874, | |
| "eval_rewards/format_reward": 0.9625, | |
| "eval_rewards/format_reward_staging": 0.9875, | |
| "eval_runtime": 141.9444, | |
| "eval_samples_per_second": 0.141, | |
| "eval_steps_per_second": 0.035, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 595.484375, | |
| "epoch": 110.8, | |
| "grad_norm": 1.2811054745764274, | |
| "kl": 0.112457275390625, | |
| "learning_rate": 1.8141155183563193e-06, | |
| "loss": 0.0085, | |
| "reward": 14.736718833446503, | |
| "reward_std": 5.505104329437017, | |
| "rewards/accuracy_reward_staging": 1.2760156439617276, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.9921875, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 589.59375, | |
| "epoch": 111.8, | |
| "grad_norm": 1.2357687466322653, | |
| "kl": 0.1142578125, | |
| "learning_rate": 1.8090169943749474e-06, | |
| "loss": -0.0064, | |
| "reward": 13.935937494039536, | |
| "reward_std": 4.747958414256573, | |
| "rewards/accuracy_reward_staging": 1.195156266912818, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 604.953125, | |
| "epoch": 112.8, | |
| "grad_norm": 1.4229471167256136, | |
| "kl": 0.14752197265625, | |
| "learning_rate": 1.803856860617217e-06, | |
| "loss": 0.0281, | |
| "reward": 13.79843756556511, | |
| "reward_std": 5.385790981352329, | |
| "rewards/accuracy_reward_staging": 1.1845312714576721, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 611.328125, | |
| "epoch": 113.8, | |
| "grad_norm": 24.422765952176345, | |
| "kl": 0.330352783203125, | |
| "learning_rate": 1.7986355100472927e-06, | |
| "loss": 0.0504, | |
| "reward": 14.092187762260437, | |
| "reward_std": 5.095987647771835, | |
| "rewards/accuracy_reward_staging": 1.2107812762260437, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 567.453125, | |
| "epoch": 114.8, | |
| "grad_norm": 5.413230038864145, | |
| "kl": 0.17083740234375, | |
| "learning_rate": 1.7933533402912351e-06, | |
| "loss": 0.0736, | |
| "reward": 13.521874904632568, | |
| "reward_std": 4.76890967041254, | |
| "rewards/accuracy_reward_staging": 1.1584375277161598, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 589.703125, | |
| "epoch": 115.8, | |
| "grad_norm": 1.9014947184206379, | |
| "kl": 0.17449951171875, | |
| "learning_rate": 1.7880107536067217e-06, | |
| "loss": 0.0221, | |
| "reward": 12.971875101327896, | |
| "reward_std": 5.6128582283854485, | |
| "rewards/accuracy_reward_staging": 1.1065624989569187, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 593.71875, | |
| "epoch": 116.8, | |
| "grad_norm": 5.015821786557117, | |
| "kl": 0.3743896484375, | |
| "learning_rate": 1.7826081568524138e-06, | |
| "loss": 0.0006, | |
| "reward": 14.44375005364418, | |
| "reward_std": 5.561103023588657, | |
| "rewards/accuracy_reward_staging": 1.247500006109476, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 541.34375, | |
| "epoch": 117.8, | |
| "grad_norm": 2.862584895806164, | |
| "kl": 0.159332275390625, | |
| "learning_rate": 1.7771459614569707e-06, | |
| "loss": -0.0004, | |
| "reward": 13.903124928474426, | |
| "reward_std": 4.877812258899212, | |
| "rewards/accuracy_reward_staging": 1.1950000133365393, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 552.203125, | |
| "epoch": 118.8, | |
| "grad_norm": 1.2925016251536252, | |
| "kl": 0.136077880859375, | |
| "learning_rate": 1.7716245833877198e-06, | |
| "loss": 0.0437, | |
| "reward": 14.979687631130219, | |
| "reward_std": 5.167752608656883, | |
| "rewards/accuracy_reward_staging": 1.3042187504470348, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 119.8, | |
| "grad_norm": 1.268704610469971, | |
| "learning_rate": 1.766044443118978e-06, | |
| "loss": 0.0381, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 119.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 613.875, | |
| "eval_kl": 0.1421142578125, | |
| "eval_loss": 0.042472995817661285, | |
| "eval_reward": 14.473749923706055, | |
| "eval_reward_std": 4.867543476819992, | |
| "eval_rewards/accuracy_reward_staging": 1.2536250054836273, | |
| "eval_rewards/format_reward": 0.95, | |
| "eval_rewards/format_reward_staging": 0.9875, | |
| "eval_runtime": 140.6458, | |
| "eval_samples_per_second": 0.142, | |
| "eval_steps_per_second": 0.036, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 632.1796875, | |
| "epoch": 120.8, | |
| "grad_norm": 1.232115329282542, | |
| "kl": 0.1269073486328125, | |
| "learning_rate": 1.760405965600031e-06, | |
| "loss": 0.0088, | |
| "reward": 14.570312559604645, | |
| "reward_std": 5.063761539757252, | |
| "rewards/accuracy_reward_staging": 1.262500001117587, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.9765625, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 631.671875, | |
| "epoch": 121.8, | |
| "grad_norm": 1.3808041357571472, | |
| "kl": 0.14166259765625, | |
| "learning_rate": 1.7547095802227721e-06, | |
| "loss": 0.0158, | |
| "reward": 13.695312559604645, | |
| "reward_std": 5.737492188811302, | |
| "rewards/accuracy_reward_staging": 1.1742187663912773, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 635.234375, | |
| "epoch": 122.8, | |
| "grad_norm": 1.247279598577896, | |
| "kl": 0.12890625, | |
| "learning_rate": 1.7489557207890023e-06, | |
| "loss": 0.0455, | |
| "reward": 12.946875035762787, | |
| "reward_std": 4.728762552142143, | |
| "rewards/accuracy_reward_staging": 1.100937519222498, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 578.359375, | |
| "epoch": 123.8, | |
| "grad_norm": 1.3012048204012603, | |
| "kl": 0.14288330078125, | |
| "learning_rate": 1.743144825477394e-06, | |
| "loss": 0.0237, | |
| "reward": 14.440625131130219, | |
| "reward_std": 5.539311669766903, | |
| "rewards/accuracy_reward_staging": 1.2471875082701445, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 618.046875, | |
| "epoch": 124.8, | |
| "grad_norm": 1.3325921669841398, | |
| "kl": 0.1463623046875, | |
| "learning_rate": 1.737277336810124e-06, | |
| "loss": 0.0604, | |
| "reward": 12.951562702655792, | |
| "reward_std": 3.5424299761652946, | |
| "rewards/accuracy_reward_staging": 1.1014062613248825, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 600.65625, | |
| "epoch": 125.8, | |
| "grad_norm": 1.692915618997029, | |
| "kl": 0.158050537109375, | |
| "learning_rate": 1.7313537016191704e-06, | |
| "loss": 0.0314, | |
| "reward": 15.428125023841858, | |
| "reward_std": 5.270965404808521, | |
| "rewards/accuracy_reward_staging": 1.3459375128149986, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 616.15625, | |
| "epoch": 126.8, | |
| "grad_norm": 1.1976264555767795, | |
| "kl": 0.1231689453125, | |
| "learning_rate": 1.7253743710122874e-06, | |
| "loss": -0.0521, | |
| "reward": 15.524999856948853, | |
| "reward_std": 4.5880225002765656, | |
| "rewards/accuracy_reward_staging": 1.3556249924004078, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 636.265625, | |
| "epoch": 127.8, | |
| "grad_norm": 1.5736176734740694, | |
| "kl": 0.14605712890625, | |
| "learning_rate": 1.719339800338651e-06, | |
| "loss": -0.0053, | |
| "reward": 13.117187559604645, | |
| "reward_std": 4.341549597680569, | |
| "rewards/accuracy_reward_staging": 1.1273437663912773, | |
| "rewards/format_reward": 0.890625, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 646.5, | |
| "epoch": 128.8, | |
| "grad_norm": 2.8516785822154693, | |
| "kl": 0.19573974609375, | |
| "learning_rate": 1.7132504491541815e-06, | |
| "loss": -0.0363, | |
| "reward": 13.059375166893005, | |
| "reward_std": 4.451074585318565, | |
| "rewards/accuracy_reward_staging": 1.1121875122189522, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 129.8, | |
| "grad_norm": 1.330165854405246, | |
| "learning_rate": 1.7071067811865474e-06, | |
| "loss": 0.0375, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 129.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 637.2125, | |
| "eval_kl": 0.1220947265625, | |
| "eval_loss": 0.0286346934735775, | |
| "eval_reward": 14.201250052452087, | |
| "eval_reward_std": 5.366847103834152, | |
| "eval_rewards/accuracy_reward_staging": 1.2251250058412553, | |
| "eval_rewards/format_reward": 0.9625, | |
| "eval_rewards/format_reward_staging": 0.9875, | |
| "eval_runtime": 144.6284, | |
| "eval_samples_per_second": 0.138, | |
| "eval_steps_per_second": 0.035, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 584.5859375, | |
| "epoch": 130.8, | |
| "grad_norm": 1.307152401229373, | |
| "kl": 0.1373443603515625, | |
| "learning_rate": 1.7009092642998508e-06, | |
| "loss": -0.0099, | |
| "reward": 13.771875083446503, | |
| "reward_std": 5.035757407546043, | |
| "rewards/accuracy_reward_staging": 1.1803125254809856, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 645.203125, | |
| "epoch": 131.8, | |
| "grad_norm": 1.2708814145835345, | |
| "kl": 0.129486083984375, | |
| "learning_rate": 1.6946583704589972e-06, | |
| "loss": 0.0643, | |
| "reward": 12.623437643051147, | |
| "reward_std": 5.542073376476765, | |
| "rewards/accuracy_reward_staging": 1.0654687564820051, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 600.453125, | |
| "epoch": 132.8, | |
| "grad_norm": 1.293321609591619, | |
| "kl": 0.135955810546875, | |
| "learning_rate": 1.6883545756937537e-06, | |
| "loss": -0.0023, | |
| "reward": 13.604687690734863, | |
| "reward_std": 5.285826697945595, | |
| "rewards/accuracy_reward_staging": 1.163593776524067, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 622.75, | |
| "epoch": 133.8, | |
| "grad_norm": 1.25404998096487, | |
| "kl": 0.123260498046875, | |
| "learning_rate": 1.6819983600624985e-06, | |
| "loss": -0.0007, | |
| "reward": 13.665625005960464, | |
| "reward_std": 5.407533464720473, | |
| "rewards/accuracy_reward_staging": 1.1743750125169754, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 543.90625, | |
| "epoch": 134.8, | |
| "grad_norm": 1.4502660038061592, | |
| "kl": 0.146087646484375, | |
| "learning_rate": 1.6755902076156602e-06, | |
| "loss": 0.0388, | |
| "reward": 13.443750143051147, | |
| "reward_std": 6.642620116472244, | |
| "rewards/accuracy_reward_staging": 1.1490625031292439, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 652.296875, | |
| "epoch": 135.8, | |
| "grad_norm": 1.2487523655371195, | |
| "kl": 0.114105224609375, | |
| "learning_rate": 1.669130606358858e-06, | |
| "loss": 0.0277, | |
| "reward": 14.48281255364418, | |
| "reward_std": 5.782497301697731, | |
| "rewards/accuracy_reward_staging": 1.2576562836766243, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 624.75, | |
| "epoch": 136.8, | |
| "grad_norm": 1.3402764758528873, | |
| "kl": 0.133087158203125, | |
| "learning_rate": 1.6626200482157374e-06, | |
| "loss": 0.0515, | |
| "reward": 12.865624994039536, | |
| "reward_std": 4.773457303643227, | |
| "rewards/accuracy_reward_staging": 1.0959375277161598, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 581.875, | |
| "epoch": 137.8, | |
| "grad_norm": 1.2528044000047398, | |
| "kl": 0.121673583984375, | |
| "learning_rate": 1.6560590289905071e-06, | |
| "loss": 0.0046, | |
| "reward": 14.821875035762787, | |
| "reward_std": 5.201211467385292, | |
| "rewards/accuracy_reward_staging": 1.2837499883025885, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 611.15625, | |
| "epoch": 138.8, | |
| "grad_norm": 1.6131227618811717, | |
| "kl": 0.118804931640625, | |
| "learning_rate": 1.6494480483301835e-06, | |
| "loss": 0.0186, | |
| "reward": 14.048437416553497, | |
| "reward_std": 4.07040748000145, | |
| "rewards/accuracy_reward_staging": 1.2157812491059303, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 139.8, | |
| "grad_norm": 1.348569648877767, | |
| "learning_rate": 1.6427876096865393e-06, | |
| "loss": 0.0477, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 139.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 630.375, | |
| "eval_kl": 0.135888671875, | |
| "eval_loss": 0.021134015172719955, | |
| "eval_reward": 13.206250190734863, | |
| "eval_reward_std": 4.864674496650696, | |
| "eval_rewards/accuracy_reward_staging": 1.1293750241398812, | |
| "eval_rewards/format_reward": 0.95, | |
| "eval_rewards/format_reward_staging": 0.9625, | |
| "eval_runtime": 154.772, | |
| "eval_samples_per_second": 0.129, | |
| "eval_steps_per_second": 0.032, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 667.46875, | |
| "epoch": 140.8, | |
| "grad_norm": 1.3550397685799231, | |
| "kl": 0.1424407958984375, | |
| "learning_rate": 1.6360782202777638e-06, | |
| "loss": 0.022, | |
| "reward": 13.272656485438347, | |
| "reward_std": 5.271991036832333, | |
| "rewards/accuracy_reward_staging": 1.135859395377338, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.9765625, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 581.15625, | |
| "epoch": 141.8, | |
| "grad_norm": 1.274351995849414, | |
| "kl": 0.1263427734375, | |
| "learning_rate": 1.6293203910498375e-06, | |
| "loss": 0.0166, | |
| "reward": 13.015625029802322, | |
| "reward_std": 5.490728512406349, | |
| "rewards/accuracy_reward_staging": 1.1062499918043613, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 581.9375, | |
| "epoch": 142.8, | |
| "grad_norm": 1.4551759286658215, | |
| "kl": 0.131011962890625, | |
| "learning_rate": 1.6225146366376196e-06, | |
| "loss": 0.0763, | |
| "reward": 13.957812488079071, | |
| "reward_std": 5.092830486595631, | |
| "rewards/accuracy_reward_staging": 1.2004687525331974, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 616.640625, | |
| "epoch": 143.8, | |
| "grad_norm": 1.246097895354405, | |
| "kl": 0.108062744140625, | |
| "learning_rate": 1.615661475325658e-06, | |
| "loss": 0.0785, | |
| "reward": 13.190625131130219, | |
| "reward_std": 4.567478813230991, | |
| "rewards/accuracy_reward_staging": 1.1268750242888927, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 641.4375, | |
| "epoch": 144.8, | |
| "grad_norm": 1.3492898310297994, | |
| "kl": 0.137298583984375, | |
| "learning_rate": 1.6087614290087205e-06, | |
| "loss": 0.0778, | |
| "reward": 13.187500149011612, | |
| "reward_std": 5.05699796974659, | |
| "rewards/accuracy_reward_staging": 1.1281250044703484, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 615.421875, | |
| "epoch": 145.8, | |
| "grad_norm": 1.2389330703315027, | |
| "kl": 0.11785888671875, | |
| "learning_rate": 1.6018150231520484e-06, | |
| "loss": -0.0105, | |
| "reward": 13.951562494039536, | |
| "reward_std": 4.975374720990658, | |
| "rewards/accuracy_reward_staging": 1.1951562650501728, | |
| "rewards/format_reward": 1.0, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 628.953125, | |
| "epoch": 146.8, | |
| "grad_norm": 1.1484661979759183, | |
| "kl": 0.10455322265625, | |
| "learning_rate": 1.5948227867513413e-06, | |
| "loss": 0.0083, | |
| "reward": 13.078125178813934, | |
| "reward_std": 4.869858503341675, | |
| "rewards/accuracy_reward_staging": 1.117187526077032, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 644.734375, | |
| "epoch": 147.8, | |
| "grad_norm": 3.8787328503041603, | |
| "kl": 0.148834228515625, | |
| "learning_rate": 1.587785252292473e-06, | |
| "loss": 0.0525, | |
| "reward": 12.510937660932541, | |
| "reward_std": 5.20218176394701, | |
| "rewards/accuracy_reward_staging": 1.0557812713086605, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 622.6875, | |
| "epoch": 148.8, | |
| "grad_norm": 1.2193760476315514, | |
| "kl": 0.125579833984375, | |
| "learning_rate": 1.5807029557109397e-06, | |
| "loss": 0.0084, | |
| "reward": 13.721875101327896, | |
| "reward_std": 5.613097697496414, | |
| "rewards/accuracy_reward_staging": 1.180000003427267, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 149.8, | |
| "grad_norm": 1.3382966777749914, | |
| "learning_rate": 1.573576436351046e-06, | |
| "loss": 0.0283, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 149.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 617.3625, | |
| "eval_kl": 0.1303466796875, | |
| "eval_loss": 0.013808819465339184, | |
| "eval_reward": 13.037500023841858, | |
| "eval_reward_std": 4.972468680143356, | |
| "eval_rewards/accuracy_reward_staging": 1.1087500318884849, | |
| "eval_rewards/format_reward": 0.9625, | |
| "eval_rewards/format_reward_staging": 0.9875, | |
| "eval_runtime": 145.0293, | |
| "eval_samples_per_second": 0.138, | |
| "eval_steps_per_second": 0.034, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 587.78125, | |
| "epoch": 150.8, | |
| "grad_norm": 1.288894243312563, | |
| "kl": 0.124786376953125, | |
| "learning_rate": 1.5664062369248328e-06, | |
| "loss": 0.0259, | |
| "reward": 14.903124868869781, | |
| "reward_std": 6.114742249250412, | |
| "rewards/accuracy_reward_staging": 1.2934375274926424, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 604.6875, | |
| "epoch": 151.8, | |
| "grad_norm": 1.3773815626279793, | |
| "kl": 0.126007080078125, | |
| "learning_rate": 1.5591929034707466e-06, | |
| "loss": 0.0712, | |
| "reward": 14.687500357627869, | |
| "reward_std": 5.71131344884634, | |
| "rewards/accuracy_reward_staging": 1.273437511175871, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 615.921875, | |
| "epoch": 152.8, | |
| "grad_norm": 1.4485533003534696, | |
| "kl": 0.135894775390625, | |
| "learning_rate": 1.551936985312058e-06, | |
| "loss": 0.0497, | |
| "reward": 15.312499761581421, | |
| "reward_std": 4.291183479130268, | |
| "rewards/accuracy_reward_staging": 1.3562500244006515, | |
| "rewards/format_reward": 0.828125, | |
| "rewards/format_reward_staging": 0.921875, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 652.75, | |
| "epoch": 153.8, | |
| "grad_norm": 1.283755345470008, | |
| "kl": 0.14154052734375, | |
| "learning_rate": 1.544639035015027e-06, | |
| "loss": 0.0349, | |
| "reward": 13.10937511920929, | |
| "reward_std": 5.929120138287544, | |
| "rewards/accuracy_reward_staging": 1.1281250044703484, | |
| "rewards/format_reward": 0.859375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 579.21875, | |
| "epoch": 154.8, | |
| "grad_norm": 4.3223327812781625, | |
| "kl": 0.21929931640625, | |
| "learning_rate": 1.537299608346824e-06, | |
| "loss": 0.0015, | |
| "reward": 16.134375244379044, | |
| "reward_std": 5.386517338454723, | |
| "rewards/accuracy_reward_staging": 1.4243750181049109, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 616.15625, | |
| "epoch": 155.8, | |
| "grad_norm": 1.600841914035174, | |
| "kl": 0.154449462890625, | |
| "learning_rate": 1.5299192642332049e-06, | |
| "loss": 0.0289, | |
| "reward": 14.656250029802322, | |
| "reward_std": 5.294310428202152, | |
| "rewards/accuracy_reward_staging": 1.2781250141561031, | |
| "rewards/format_reward": 0.890625, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 589.578125, | |
| "epoch": 156.8, | |
| "grad_norm": 1.4513044601818303, | |
| "kl": 0.16229248046875, | |
| "learning_rate": 1.5224985647159488e-06, | |
| "loss": 0.0441, | |
| "reward": 14.590624958276749, | |
| "reward_std": 5.033867612481117, | |
| "rewards/accuracy_reward_staging": 1.273124998435378, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 599.515625, | |
| "epoch": 157.8, | |
| "grad_norm": 1.3153063149563227, | |
| "kl": 0.148529052734375, | |
| "learning_rate": 1.5150380749100543e-06, | |
| "loss": 0.0618, | |
| "reward": 15.898437559604645, | |
| "reward_std": 4.4645668268203735, | |
| "rewards/accuracy_reward_staging": 1.4023437574505806, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 604.359375, | |
| "epoch": 158.8, | |
| "grad_norm": 8.479997272303532, | |
| "kl": 0.165130615234375, | |
| "learning_rate": 1.5075383629607041e-06, | |
| "loss": 0.0777, | |
| "reward": 13.142187595367432, | |
| "reward_std": 5.132370471954346, | |
| "rewards/accuracy_reward_staging": 1.132968744263053, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 159.8, | |
| "grad_norm": 1.1917014388947642, | |
| "learning_rate": 1.5e-06, | |
| "loss": -0.0404, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 159.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 638.1625, | |
| "eval_kl": 0.14365234375, | |
| "eval_loss": 0.08888934552669525, | |
| "eval_reward": 14.796250009536744, | |
| "eval_reward_std": 5.661263364553451, | |
| "eval_rewards/accuracy_reward_staging": 1.295874996483326, | |
| "eval_rewards/format_reward": 0.9, | |
| "eval_rewards/format_reward_staging": 0.9375, | |
| "eval_runtime": 146.8527, | |
| "eval_samples_per_second": 0.136, | |
| "eval_steps_per_second": 0.034, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 651.7421875, | |
| "epoch": 160.8, | |
| "grad_norm": 1.2387655968511657, | |
| "kl": 0.1446075439453125, | |
| "learning_rate": 1.4924235601034672e-06, | |
| "loss": 0.0701, | |
| "reward": 15.571094110608101, | |
| "reward_std": 5.18467765673995, | |
| "rewards/accuracy_reward_staging": 1.3766406429931521, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.9609375, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 600.4375, | |
| "epoch": 161.8, | |
| "grad_norm": 1.229465314986415, | |
| "kl": 0.13531494140625, | |
| "learning_rate": 1.4848096202463372e-06, | |
| "loss": -0.0057, | |
| "reward": 16.17187523841858, | |
| "reward_std": 4.812445372343063, | |
| "rewards/accuracy_reward_staging": 1.4234375022351742, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 582.125, | |
| "epoch": 162.8, | |
| "grad_norm": 1.3529028258154412, | |
| "kl": 0.1470947265625, | |
| "learning_rate": 1.4771587602596083e-06, | |
| "loss": 0.0891, | |
| "reward": 15.801562637090683, | |
| "reward_std": 4.80203927308321, | |
| "rewards/accuracy_reward_staging": 1.392656246200204, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 611.609375, | |
| "epoch": 163.8, | |
| "grad_norm": 1.4068895541908286, | |
| "kl": 0.15350341796875, | |
| "learning_rate": 1.4694715627858908e-06, | |
| "loss": 0.0591, | |
| "reward": 14.951562643051147, | |
| "reward_std": 5.22976279258728, | |
| "rewards/accuracy_reward_staging": 1.3045312520116568, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 609.671875, | |
| "epoch": 164.8, | |
| "grad_norm": 1.2658845630248547, | |
| "kl": 0.1463623046875, | |
| "learning_rate": 1.461748613235034e-06, | |
| "loss": 0.0266, | |
| "reward": 14.03125, | |
| "reward_std": 5.7937397211790085, | |
| "rewards/accuracy_reward_staging": 1.215625025331974, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 589.875, | |
| "epoch": 165.8, | |
| "grad_norm": 1.3344028785966706, | |
| "kl": 0.1558837890625, | |
| "learning_rate": 1.4539904997395467e-06, | |
| "loss": 0.0711, | |
| "reward": 15.390625029802322, | |
| "reward_std": 4.8061781376600266, | |
| "rewards/accuracy_reward_staging": 1.3515625335276127, | |
| "rewards/format_reward": 0.890625, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 607.0625, | |
| "epoch": 166.8, | |
| "grad_norm": 1.6122384935377623, | |
| "kl": 0.178619384765625, | |
| "learning_rate": 1.4461978131098087e-06, | |
| "loss": 0.0224, | |
| "reward": 12.757812649011612, | |
| "reward_std": 5.976896375417709, | |
| "rewards/accuracy_reward_staging": 1.0867187604308128, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 555.21875, | |
| "epoch": 167.8, | |
| "grad_norm": 1.3011797765950504, | |
| "kl": 0.15509033203125, | |
| "learning_rate": 1.4383711467890773e-06, | |
| "loss": -0.0096, | |
| "reward": 16.72499978542328, | |
| "reward_std": 4.908542029559612, | |
| "rewards/accuracy_reward_staging": 1.4756250157952309, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 619.546875, | |
| "epoch": 168.8, | |
| "grad_norm": 1.71128104516776, | |
| "kl": 0.15618896484375, | |
| "learning_rate": 1.430511096808295e-06, | |
| "loss": 0.1091, | |
| "reward": 15.999999672174454, | |
| "reward_std": 4.338721185922623, | |
| "rewards/accuracy_reward_staging": 1.4124999977648258, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 169.8, | |
| "grad_norm": 1.406915756546695, | |
| "learning_rate": 1.4226182617406994e-06, | |
| "loss": 0.0365, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 169.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 599.3375, | |
| "eval_kl": 0.1652587890625, | |
| "eval_loss": 0.05737342685461044, | |
| "eval_reward": 15.347499918937682, | |
| "eval_reward_std": 5.264538067579269, | |
| "eval_rewards/accuracy_reward_staging": 1.346000000834465, | |
| "eval_rewards/format_reward": 0.925, | |
| "eval_rewards/format_reward_staging": 0.9625, | |
| "eval_runtime": 128.6979, | |
| "eval_samples_per_second": 0.155, | |
| "eval_steps_per_second": 0.039, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 607.75, | |
| "epoch": 170.8, | |
| "grad_norm": 1.4282138543951368, | |
| "kl": 0.17828369140625, | |
| "learning_rate": 1.414693242656239e-06, | |
| "loss": -0.0169, | |
| "reward": 15.774218946695328, | |
| "reward_std": 5.630698639899492, | |
| "rewards/accuracy_reward_staging": 1.3899218812584877, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 597.484375, | |
| "epoch": 171.8, | |
| "grad_norm": 2.842026235946327, | |
| "kl": 0.22344970703125, | |
| "learning_rate": 1.4067366430758004e-06, | |
| "loss": 0.0117, | |
| "reward": 13.404687702655792, | |
| "reward_std": 5.611785896122456, | |
| "rewards/accuracy_reward_staging": 1.1482812836766243, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 593.34375, | |
| "epoch": 172.8, | |
| "grad_norm": 1.355496874052129, | |
| "kl": 0.178131103515625, | |
| "learning_rate": 1.3987490689252462e-06, | |
| "loss": 0.0242, | |
| "reward": 15.953125089406967, | |
| "reward_std": 4.999604664742947, | |
| "rewards/accuracy_reward_staging": 1.403124986216426, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 1.0, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 636.546875, | |
| "epoch": 173.8, | |
| "grad_norm": 1.2708001429966154, | |
| "kl": 0.1480712890625, | |
| "learning_rate": 1.3907311284892735e-06, | |
| "loss": 0.0781, | |
| "reward": 16.860937863588333, | |
| "reward_std": 5.687329366803169, | |
| "rewards/accuracy_reward_staging": 1.4970312751829624, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 629.34375, | |
| "epoch": 174.8, | |
| "grad_norm": 1.2881497238833757, | |
| "kl": 0.184661865234375, | |
| "learning_rate": 1.3826834323650898e-06, | |
| "loss": 0.0465, | |
| "reward": 14.581250131130219, | |
| "reward_std": 5.882216438651085, | |
| "rewards/accuracy_reward_staging": 1.2831250075250864, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.875, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 636.1875, | |
| "epoch": 175.8, | |
| "grad_norm": 1.3087334876201022, | |
| "kl": 0.180084228515625, | |
| "learning_rate": 1.374606593415912e-06, | |
| "loss": 0.0515, | |
| "reward": 15.878125071525574, | |
| "reward_std": 5.341188468039036, | |
| "rewards/accuracy_reward_staging": 1.4081249758601189, | |
| "rewards/format_reward": 0.859375, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 620.109375, | |
| "epoch": 176.8, | |
| "grad_norm": 1.4376210080253358, | |
| "kl": 0.1929931640625, | |
| "learning_rate": 1.3665012267242972e-06, | |
| "loss": 0.0086, | |
| "reward": 13.495312541723251, | |
| "reward_std": 5.61242138594389, | |
| "rewards/accuracy_reward_staging": 1.1635937709361315, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 626.03125, | |
| "epoch": 177.8, | |
| "grad_norm": 1772489.589709048, | |
| "kl": 7520.199645996094, | |
| "learning_rate": 1.3583679495453e-06, | |
| "loss": 413.327, | |
| "reward": 14.464062601327896, | |
| "reward_std": 6.149559870362282, | |
| "rewards/accuracy_reward_staging": 1.2729687709361315, | |
| "rewards/format_reward": 0.859375, | |
| "rewards/format_reward_staging": 0.875, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 622.34375, | |
| "epoch": 178.8, | |
| "grad_norm": 1.2715649881539406, | |
| "kl": 0.173065185546875, | |
| "learning_rate": 1.3502073812594674e-06, | |
| "loss": -0.0255, | |
| "reward": 16.45468744635582, | |
| "reward_std": 5.025842607021332, | |
| "rewards/accuracy_reward_staging": 1.4610937684774399, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 179.8, | |
| "grad_norm": 1.3470338291497799, | |
| "learning_rate": 1.3420201433256689e-06, | |
| "loss": 0.0132, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 179.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 644.8375, | |
| "eval_kl": 0.1693603515625, | |
| "eval_loss": 0.07019542157649994, | |
| "eval_reward": 15.515000081062317, | |
| "eval_reward_std": 4.510845869779587, | |
| "eval_rewards/accuracy_reward_staging": 1.3702499970793725, | |
| "eval_rewards/format_reward": 0.8625, | |
| "eval_rewards/format_reward_staging": 0.95, | |
| "eval_runtime": 140.9488, | |
| "eval_samples_per_second": 0.142, | |
| "eval_steps_per_second": 0.035, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 602.9765625, | |
| "epoch": 180.8, | |
| "grad_norm": 1.3491479753013993, | |
| "kl": 0.183990478515625, | |
| "learning_rate": 1.3338068592337708e-06, | |
| "loss": 0.0611, | |
| "reward": 15.047656297683716, | |
| "reward_std": 5.386000510305166, | |
| "rewards/accuracy_reward_staging": 1.3149218782782555, | |
| "rewards/format_reward": 0.9453125, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 595.71875, | |
| "epoch": 181.8, | |
| "grad_norm": 1.3770681193765026, | |
| "kl": 0.19049072265625, | |
| "learning_rate": 1.3255681544571566e-06, | |
| "loss": 0.0247, | |
| "reward": 15.55312505364418, | |
| "reward_std": 5.63801646232605, | |
| "rewards/accuracy_reward_staging": 1.3615624941885471, | |
| "rewards/format_reward": 0.953125, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 615.59375, | |
| "epoch": 182.8, | |
| "grad_norm": 1.4258618367759082, | |
| "kl": 0.18341064453125, | |
| "learning_rate": 1.3173046564050923e-06, | |
| "loss": 0.0382, | |
| "reward": 15.492187589406967, | |
| "reward_std": 5.1280196234583855, | |
| "rewards/accuracy_reward_staging": 1.3664062581956387, | |
| "rewards/format_reward": 0.890625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 657.21875, | |
| "epoch": 183.8, | |
| "grad_norm": 1.4496706627719373, | |
| "kl": 0.2099609375, | |
| "learning_rate": 1.3090169943749473e-06, | |
| "loss": -0.0093, | |
| "reward": 15.481249988079071, | |
| "reward_std": 4.195666573941708, | |
| "rewards/accuracy_reward_staging": 1.360624998807907, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 643.625, | |
| "epoch": 184.8, | |
| "grad_norm": 12.416744832467893, | |
| "kl": 0.35125732421875, | |
| "learning_rate": 1.3007057995042729e-06, | |
| "loss": 0.0552, | |
| "reward": 16.317187398672104, | |
| "reward_std": 5.490992607548833, | |
| "rewards/accuracy_reward_staging": 1.4489062502980232, | |
| "rewards/format_reward": 0.859375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 623.515625, | |
| "epoch": 185.8, | |
| "grad_norm": 1.3589509266087427, | |
| "kl": 0.20068359375, | |
| "learning_rate": 1.2923717047227368e-06, | |
| "loss": 0.0935, | |
| "reward": 13.135937482118607, | |
| "reward_std": 5.502887517213821, | |
| "rewards/accuracy_reward_staging": 1.1276562362909317, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 617.5625, | |
| "epoch": 186.8, | |
| "grad_norm": 5.614254453009779, | |
| "kl": 0.2403564453125, | |
| "learning_rate": 1.2840153447039228e-06, | |
| "loss": 0.0561, | |
| "reward": 14.834374755620956, | |
| "reward_std": 6.0779377073049545, | |
| "rewards/accuracy_reward_staging": 1.2990624997764826, | |
| "rewards/format_reward": 0.890625, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 636.03125, | |
| "epoch": 187.8, | |
| "grad_norm": 1.3627377674239467, | |
| "kl": 0.1864013671875, | |
| "learning_rate": 1.275637355816999e-06, | |
| "loss": 0.078, | |
| "reward": 13.956250250339508, | |
| "reward_std": 5.885370120406151, | |
| "rewards/accuracy_reward_staging": 1.2112500164657831, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 0.921875, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 605.875, | |
| "epoch": 188.8, | |
| "grad_norm": 1.4102127840998477, | |
| "kl": 0.1846923828125, | |
| "learning_rate": 1.2672383760782567e-06, | |
| "loss": 0.0346, | |
| "reward": 14.612500011920929, | |
| "reward_std": 6.431307382881641, | |
| "rewards/accuracy_reward_staging": 1.2737499997019768, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 189.8, | |
| "grad_norm": 1.2657153918632975, | |
| "learning_rate": 1.2588190451025207e-06, | |
| "loss": 0.0905, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 189.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 639.3875, | |
| "eval_kl": 0.183544921875, | |
| "eval_loss": 0.09273408353328705, | |
| "eval_reward": 14.913750052452087, | |
| "eval_reward_std": 4.527625149488449, | |
| "eval_rewards/accuracy_reward_staging": 1.312625017762184, | |
| "eval_rewards/format_reward": 0.875, | |
| "eval_rewards/format_reward_staging": 0.9125, | |
| "eval_runtime": 149.5357, | |
| "eval_samples_per_second": 0.134, | |
| "eval_steps_per_second": 0.033, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 637.03125, | |
| "epoch": 190.8, | |
| "grad_norm": 1.5561907060681732, | |
| "kl": 0.20361328125, | |
| "learning_rate": 1.2503800040544414e-06, | |
| "loss": 0.027, | |
| "reward": 13.858593851327896, | |
| "reward_std": 5.260060213506222, | |
| "rewards/accuracy_reward_staging": 1.199921895749867, | |
| "rewards/format_reward": 0.9140625, | |
| "rewards/format_reward_staging": 0.9453125, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 614.578125, | |
| "epoch": 191.8, | |
| "grad_norm": 1.4112962880914386, | |
| "kl": 0.173828125, | |
| "learning_rate": 1.2419218955996676e-06, | |
| "loss": 0.0162, | |
| "reward": 15.854687571525574, | |
| "reward_std": 4.764394700527191, | |
| "rewards/accuracy_reward_staging": 1.3979687802493572, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 628.078125, | |
| "epoch": 192.8, | |
| "grad_norm": 1.329564996171061, | |
| "kl": 0.178253173828125, | |
| "learning_rate": 1.2334453638559054e-06, | |
| "loss": 0.0255, | |
| "reward": 13.543750017881393, | |
| "reward_std": 5.472193785011768, | |
| "rewards/accuracy_reward_staging": 1.168437510728836, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 657.796875, | |
| "epoch": 193.8, | |
| "grad_norm": 1.8007713245581922, | |
| "kl": 0.198944091796875, | |
| "learning_rate": 1.2249510543438651e-06, | |
| "loss": 0.0516, | |
| "reward": 14.325000017881393, | |
| "reward_std": 4.515300907194614, | |
| "rewards/accuracy_reward_staging": 1.2465624995529652, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 623.984375, | |
| "epoch": 194.8, | |
| "grad_norm": 1.2974682020385033, | |
| "kl": 0.1806640625, | |
| "learning_rate": 1.2164396139381029e-06, | |
| "loss": 0.0383, | |
| "reward": 14.945312559604645, | |
| "reward_std": 5.147151567041874, | |
| "rewards/accuracy_reward_staging": 1.308593761175871, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 581.5625, | |
| "epoch": 195.8, | |
| "grad_norm": 1.260278322445417, | |
| "kl": 0.18658447265625, | |
| "learning_rate": 1.207911690817759e-06, | |
| "loss": 0.0477, | |
| "reward": 14.231250047683716, | |
| "reward_std": 5.530913561582565, | |
| "rewards/accuracy_reward_staging": 1.2418750207871199, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 604.90625, | |
| "epoch": 196.8, | |
| "grad_norm": 1.3096099803251504, | |
| "kl": 0.1900634765625, | |
| "learning_rate": 1.1993679344171972e-06, | |
| "loss": 0.0251, | |
| "reward": 15.50156244635582, | |
| "reward_std": 5.51077751070261, | |
| "rewards/accuracy_reward_staging": 1.3610937595367432, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/format_reward_staging": 0.921875, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 605.453125, | |
| "epoch": 197.8, | |
| "grad_norm": 1.3586492398815606, | |
| "kl": 0.189208984375, | |
| "learning_rate": 1.1908089953765447e-06, | |
| "loss": 0.0523, | |
| "reward": 13.85781279206276, | |
| "reward_std": 4.988245405256748, | |
| "rewards/accuracy_reward_staging": 1.1998437773436308, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 635.609375, | |
| "epoch": 198.8, | |
| "grad_norm": 1.340689220119322, | |
| "kl": 0.17425537109375, | |
| "learning_rate": 1.1822355254921476e-06, | |
| "loss": 0.042, | |
| "reward": 15.098437517881393, | |
| "reward_std": 4.625658318400383, | |
| "rewards/accuracy_reward_staging": 1.3145312629640102, | |
| "rewards/format_reward": 0.984375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 199.8, | |
| "grad_norm": 1.38676313511267, | |
| "learning_rate": 1.1736481776669305e-06, | |
| "loss": 0.0078, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 199.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 617.9625, | |
| "eval_kl": 0.227685546875, | |
| "eval_loss": -0.011267063207924366, | |
| "eval_reward": 15.479999876022339, | |
| "eval_reward_std": 5.917063271999359, | |
| "eval_rewards/accuracy_reward_staging": 1.358000010251999, | |
| "eval_rewards/format_reward": 0.95, | |
| "eval_rewards/format_reward_staging": 0.95, | |
| "eval_runtime": 140.7854, | |
| "eval_samples_per_second": 0.142, | |
| "eval_steps_per_second": 0.036, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 619.3828125, | |
| "epoch": 200.8, | |
| "grad_norm": 1.2618291004150899, | |
| "kl": 0.187408447265625, | |
| "learning_rate": 1.1650476058606774e-06, | |
| "loss": 0.0638, | |
| "reward": 14.914843901991844, | |
| "reward_std": 5.3753106370568275, | |
| "rewards/accuracy_reward_staging": 1.3063281308859587, | |
| "rewards/format_reward": 0.890625, | |
| "rewards/format_reward_staging": 0.9609375, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 661.421875, | |
| "epoch": 201.8, | |
| "grad_norm": 1.3037648155909478, | |
| "kl": 0.1773681640625, | |
| "learning_rate": 1.156434465040231e-06, | |
| "loss": 0.089, | |
| "reward": 14.482812568545341, | |
| "reward_std": 5.621522009372711, | |
| "rewards/accuracy_reward_staging": 1.263906242325902, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 634.109375, | |
| "epoch": 202.8, | |
| "grad_norm": 7.276649221582544, | |
| "kl": 0.24066162109375, | |
| "learning_rate": 1.1478094111296109e-06, | |
| "loss": 0.0672, | |
| "reward": 14.378125011920929, | |
| "reward_std": 5.686573512852192, | |
| "rewards/accuracy_reward_staging": 1.2518749982118607, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 630.71875, | |
| "epoch": 203.8, | |
| "grad_norm": 1.231495456965766, | |
| "kl": 0.2060546875, | |
| "learning_rate": 1.1391731009600653e-06, | |
| "loss": 0.1112, | |
| "reward": 14.843750059604645, | |
| "reward_std": 4.25174543261528, | |
| "rewards/accuracy_reward_staging": 1.298437513411045, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 692.5625, | |
| "epoch": 204.8, | |
| "grad_norm": 1.350931922850144, | |
| "kl": 0.224212646484375, | |
| "learning_rate": 1.1305261922200517e-06, | |
| "loss": 0.0238, | |
| "reward": 15.984374910593033, | |
| "reward_std": 5.367278844118118, | |
| "rewards/accuracy_reward_staging": 1.4234375320374966, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 690.828125, | |
| "epoch": 205.8, | |
| "grad_norm": 1.318954712841193, | |
| "kl": 0.19384765625, | |
| "learning_rate": 1.1218693434051474e-06, | |
| "loss": 0.0885, | |
| "reward": 16.017187118530273, | |
| "reward_std": 5.8362889885902405, | |
| "rewards/accuracy_reward_staging": 1.41890624538064, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.921875, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 632.796875, | |
| "epoch": 206.8, | |
| "grad_norm": 1.4124473460728744, | |
| "kl": 0.241455078125, | |
| "learning_rate": 1.1132032137679068e-06, | |
| "loss": 0.0533, | |
| "reward": 16.976562440395355, | |
| "reward_std": 5.929606184363365, | |
| "rewards/accuracy_reward_staging": 1.5070312470197678, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 646.421875, | |
| "epoch": 207.8, | |
| "grad_norm": 1.6773375733949765, | |
| "kl": 0.25189208984375, | |
| "learning_rate": 1.1045284632676535e-06, | |
| "loss": 0.0348, | |
| "reward": 14.135937511920929, | |
| "reward_std": 4.714122384786606, | |
| "rewards/accuracy_reward_staging": 1.2307812497019768, | |
| "rewards/format_reward": 0.890625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 617.140625, | |
| "epoch": 208.8, | |
| "grad_norm": 1.4348854069875467, | |
| "kl": 0.22174072265625, | |
| "learning_rate": 1.095845752520224e-06, | |
| "loss": 0.0495, | |
| "reward": 14.285937666893005, | |
| "reward_std": 5.9770321398973465, | |
| "rewards/accuracy_reward_staging": 1.2473437581211329, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 0.890625, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 209.8, | |
| "grad_norm": 1.1605399733788089, | |
| "learning_rate": 1.0871557427476583e-06, | |
| "loss": 0.0552, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 209.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 655.7875, | |
| "eval_kl": 0.201123046875, | |
| "eval_loss": -0.0035779415629804134, | |
| "eval_reward": 16.239999866485597, | |
| "eval_reward_std": 4.2823482871055605, | |
| "eval_rewards/accuracy_reward_staging": 1.4415000066161157, | |
| "eval_rewards/format_reward": 0.9, | |
| "eval_rewards/format_reward_staging": 0.925, | |
| "eval_runtime": 142.3261, | |
| "eval_samples_per_second": 0.141, | |
| "eval_steps_per_second": 0.035, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 646.625, | |
| "epoch": 210.8, | |
| "grad_norm": 1.3085478801780013, | |
| "kl": 0.21307373046875, | |
| "learning_rate": 1.078459095727845e-06, | |
| "loss": 0.0801, | |
| "reward": 14.151562541723251, | |
| "reward_std": 4.665051084011793, | |
| "rewards/accuracy_reward_staging": 1.2393749924376607, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.9140625, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 627.609375, | |
| "epoch": 211.8, | |
| "grad_norm": 1.3675217879957127, | |
| "kl": 0.22113037109375, | |
| "learning_rate": 1.069756473744125e-06, | |
| "loss": 0.0458, | |
| "reward": 15.82500010728836, | |
| "reward_std": 5.255921743810177, | |
| "rewards/accuracy_reward_staging": 1.4075000062584877, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 617.390625, | |
| "epoch": 212.8, | |
| "grad_norm": 1.380807253404178, | |
| "kl": 0.20758056640625, | |
| "learning_rate": 1.061048539534857e-06, | |
| "loss": 0.0209, | |
| "reward": 15.371874898672104, | |
| "reward_std": 6.567200765013695, | |
| "rewards/accuracy_reward_staging": 1.3559375293552876, | |
| "rewards/format_reward": 0.890625, | |
| "rewards/format_reward_staging": 0.921875, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 620.953125, | |
| "epoch": 213.8, | |
| "grad_norm": 1.3902160353507123, | |
| "kl": 0.21124267578125, | |
| "learning_rate": 1.052335956242944e-06, | |
| "loss": 0.0436, | |
| "reward": 15.621874839067459, | |
| "reward_std": 5.122038297355175, | |
| "rewards/accuracy_reward_staging": 1.382499996572733, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.953125, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 655.359375, | |
| "epoch": 214.8, | |
| "grad_norm": 1.3806424551742382, | |
| "kl": 0.22705078125, | |
| "learning_rate": 1.043619387365336e-06, | |
| "loss": -0.0022, | |
| "reward": 13.870312362909317, | |
| "reward_std": 5.269171215593815, | |
| "rewards/accuracy_reward_staging": 1.2057812418788671, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.90625, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 655.09375, | |
| "epoch": 215.8, | |
| "grad_norm": 2.656219707386371, | |
| "kl": 0.2349853515625, | |
| "learning_rate": 1.034899496702501e-06, | |
| "loss": 0.0562, | |
| "reward": 14.140625029802322, | |
| "reward_std": 5.58522791415453, | |
| "rewards/accuracy_reward_staging": 1.2312500067055225, | |
| "rewards/format_reward": 0.890625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 622.8125, | |
| "epoch": 216.8, | |
| "grad_norm": 1.5209208648591772, | |
| "kl": 0.20660400390625, | |
| "learning_rate": 1.0261769483078732e-06, | |
| "loss": 0.0816, | |
| "reward": 14.775000303983688, | |
| "reward_std": 4.954350218176842, | |
| "rewards/accuracy_reward_staging": 1.2931250091642141, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.96875, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 650.234375, | |
| "epoch": 217.8, | |
| "grad_norm": 1.55208660193829, | |
| "kl": 0.210693359375, | |
| "learning_rate": 1.0174524064372837e-06, | |
| "loss": 0.0744, | |
| "reward": 14.27812522649765, | |
| "reward_std": 4.545742444694042, | |
| "rewards/accuracy_reward_staging": 1.2575000151991844, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.921875, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 617.234375, | |
| "epoch": 218.8, | |
| "grad_norm": 1.3928599706734481, | |
| "kl": 0.20306396484375, | |
| "learning_rate": 1.0087265354983738e-06, | |
| "loss": 0.0865, | |
| "reward": 14.462500095367432, | |
| "reward_std": 5.321807600557804, | |
| "rewards/accuracy_reward_staging": 1.2571874894201756, | |
| "rewards/format_reward": 0.90625, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 219.8, | |
| "grad_norm": 1.5191361705497182, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0929, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 219.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 621.85, | |
| "eval_kl": 0.18828125, | |
| "eval_loss": 0.07175321877002716, | |
| "eval_reward": 15.787499904632568, | |
| "eval_reward_std": 4.991747093200684, | |
| "eval_rewards/accuracy_reward_staging": 1.39125002771616, | |
| "eval_rewards/format_reward": 0.925, | |
| "eval_rewards/format_reward_staging": 0.95, | |
| "eval_runtime": 143.7256, | |
| "eval_samples_per_second": 0.139, | |
| "eval_steps_per_second": 0.035, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 626.6953125, | |
| "epoch": 220.8, | |
| "grad_norm": 1.1979189103689327, | |
| "kl": 0.20904541015625, | |
| "learning_rate": 9.912734645016263e-07, | |
| "loss": 0.0653, | |
| "reward": 14.830468773841858, | |
| "reward_std": 4.90237557888031, | |
| "rewards/accuracy_reward_staging": 1.3041406441479921, | |
| "rewards/format_reward": 0.8515625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 618.171875, | |
| "epoch": 221.8, | |
| "grad_norm": 1.3245626778227455, | |
| "kl": 0.19866943359375, | |
| "learning_rate": 9.825475935627165e-07, | |
| "loss": 0.0378, | |
| "reward": 15.185937464237213, | |
| "reward_std": 6.512602657079697, | |
| "rewards/accuracy_reward_staging": 1.3451562821865082, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.921875, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 655.546875, | |
| "epoch": 222.8, | |
| "grad_norm": 1.2627576071000894, | |
| "kl": 0.22076416015625, | |
| "learning_rate": 9.73823051692127e-07, | |
| "loss": 0.0823, | |
| "reward": 14.023437589406967, | |
| "reward_std": 5.150766499340534, | |
| "rewards/accuracy_reward_staging": 1.2210937440395355, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 630.046875, | |
| "epoch": 223.8, | |
| "grad_norm": 1.4630069125653442, | |
| "kl": 0.22393798828125, | |
| "learning_rate": 9.651005032974993e-07, | |
| "loss": 0.1163, | |
| "reward": 15.462500154972076, | |
| "reward_std": 4.448259741067886, | |
| "rewards/accuracy_reward_staging": 1.3556250110268593, | |
| "rewards/format_reward": 0.921875, | |
| "rewards/format_reward_staging": 0.984375, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 626.4375, | |
| "epoch": 224.8, | |
| "grad_norm": 1.3737324086080591, | |
| "kl": 0.24346923828125, | |
| "learning_rate": 9.56380612634664e-07, | |
| "loss": 0.073, | |
| "reward": 13.575000047683716, | |
| "reward_std": 5.982485473155975, | |
| "rewards/accuracy_reward_staging": 1.174687497317791, | |
| "rewards/format_reward": 0.890625, | |
| "rewards/format_reward_staging": 0.9375, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 696.59375, | |
| "epoch": 225.8, | |
| "grad_norm": 1.4368525693959429, | |
| "kl": 0.2528076171875, | |
| "learning_rate": 9.476640437570561e-07, | |
| "loss": 0.0855, | |
| "reward": 14.559374749660492, | |
| "reward_std": 6.478231497108936, | |
| "rewards/accuracy_reward_staging": 1.2887500040233135, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.890625, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 655.390625, | |
| "epoch": 226.8, | |
| "grad_norm": 1.3423833107341736, | |
| "kl": 0.2432861328125, | |
| "learning_rate": 9.38951460465143e-07, | |
| "loss": 0.0918, | |
| "reward": 13.806249856948853, | |
| "reward_std": 5.268295802175999, | |
| "rewards/accuracy_reward_staging": 1.2056250125169754, | |
| "rewards/format_reward": 0.828125, | |
| "rewards/format_reward_staging": 0.921875, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 720.40625, | |
| "epoch": 227.8, | |
| "grad_norm": 2.0887623964966555, | |
| "kl": 0.26153564453125, | |
| "learning_rate": 9.302435262558747e-07, | |
| "loss": 0.0741, | |
| "reward": 14.531249985098839, | |
| "reward_std": 6.193335264921188, | |
| "rewards/accuracy_reward_staging": 1.2843750081956387, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.84375, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 699.34375, | |
| "epoch": 228.8, | |
| "grad_norm": 1.5817414267449967, | |
| "kl": 0.25372314453125, | |
| "learning_rate": 9.215409042721551e-07, | |
| "loss": 0.1477, | |
| "reward": 14.853124976158142, | |
| "reward_std": 7.3317131996154785, | |
| "rewards/accuracy_reward_staging": 1.3150000125169754, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.890625, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 229.8, | |
| "grad_norm": 1.4707093666889888, | |
| "learning_rate": 9.128442572523417e-07, | |
| "loss": 0.1039, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 229.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 699.725, | |
| "eval_kl": 0.27763671875, | |
| "eval_loss": 0.13621756434440613, | |
| "eval_reward": 15.102499961853027, | |
| "eval_reward_std": 5.6639987349510195, | |
| "eval_rewards/accuracy_reward_staging": 1.337749996781349, | |
| "eval_rewards/format_reward": 0.825, | |
| "eval_rewards/format_reward_staging": 0.9, | |
| "eval_runtime": 176.4013, | |
| "eval_samples_per_second": 0.113, | |
| "eval_steps_per_second": 0.028, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 750.5234375, | |
| "epoch": 230.8, | |
| "grad_norm": 2.5022162810953015, | |
| "kl": 0.310821533203125, | |
| "learning_rate": 9.04154247479776e-07, | |
| "loss": 0.0621, | |
| "reward": 13.921874970197678, | |
| "reward_std": 5.641379028558731, | |
| "rewards/accuracy_reward_staging": 1.2312500048428774, | |
| "rewards/format_reward": 0.7421875, | |
| "rewards/format_reward_staging": 0.8671875, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 737.5, | |
| "epoch": 231.8, | |
| "grad_norm": 1.3203764343240048, | |
| "kl": 0.2919921875, | |
| "learning_rate": 8.954715367323466e-07, | |
| "loss": 0.1254, | |
| "reward": 13.312500059604645, | |
| "reward_std": 5.994187116622925, | |
| "rewards/accuracy_reward_staging": 1.1750000100582838, | |
| "rewards/format_reward": 0.703125, | |
| "rewards/format_reward_staging": 0.859375, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 663.015625, | |
| "epoch": 232.8, | |
| "grad_norm": 1.6833018973839629, | |
| "kl": 0.306640625, | |
| "learning_rate": 8.867967862320933e-07, | |
| "loss": 0.0829, | |
| "reward": 11.451562643051147, | |
| "reward_std": 6.721396386623383, | |
| "rewards/accuracy_reward_staging": 0.9779687598347664, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.796875, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 745.671875, | |
| "epoch": 233.8, | |
| "grad_norm": 1.5784019252688062, | |
| "kl": 0.3076171875, | |
| "learning_rate": 8.781306565948526e-07, | |
| "loss": 0.0822, | |
| "reward": 13.026562601327896, | |
| "reward_std": 4.835877507925034, | |
| "rewards/accuracy_reward_staging": 1.143281283788383, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.8125, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 716.4375, | |
| "epoch": 234.8, | |
| "grad_norm": 1.5923082293463926, | |
| "kl": 0.3375244140625, | |
| "learning_rate": 8.694738077799486e-07, | |
| "loss": 0.0811, | |
| "reward": 13.98749989271164, | |
| "reward_std": 7.312740258872509, | |
| "rewards/accuracy_reward_staging": 1.2425000295042992, | |
| "rewards/format_reward": 0.796875, | |
| "rewards/format_reward_staging": 0.765625, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 728.125, | |
| "epoch": 235.8, | |
| "grad_norm": 1.520271972538422, | |
| "kl": 0.3232421875, | |
| "learning_rate": 8.608268990399348e-07, | |
| "loss": 0.1051, | |
| "reward": 13.317187458276749, | |
| "reward_std": 7.371540606021881, | |
| "rewards/accuracy_reward_staging": 1.1739062573760748, | |
| "rewards/format_reward": 0.71875, | |
| "rewards/format_reward_staging": 0.859375, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 804.15625, | |
| "epoch": 236.8, | |
| "grad_norm": 1.3584766642039745, | |
| "kl": 0.3104248046875, | |
| "learning_rate": 8.521905888703893e-07, | |
| "loss": 0.1753, | |
| "reward": 12.36562493443489, | |
| "reward_std": 7.354695707559586, | |
| "rewards/accuracy_reward_staging": 1.0818749964237213, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.765625, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 691.6875, | |
| "epoch": 237.8, | |
| "grad_norm": 1.4062373976771998, | |
| "kl": 0.3116455078125, | |
| "learning_rate": 8.435655349597689e-07, | |
| "loss": 0.1024, | |
| "reward": 13.564062476158142, | |
| "reward_std": 6.342557780444622, | |
| "rewards/accuracy_reward_staging": 1.2001562491059303, | |
| "rewards/format_reward": 0.796875, | |
| "rewards/format_reward_staging": 0.765625, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 825.25, | |
| "epoch": 238.8, | |
| "grad_norm": 1.3912617565210863, | |
| "kl": 0.326416015625, | |
| "learning_rate": 8.349523941393223e-07, | |
| "loss": 0.1304, | |
| "reward": 13.695312589406967, | |
| "reward_std": 7.123799741268158, | |
| "rewards/accuracy_reward_staging": 1.211718775331974, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.796875, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 239.8, | |
| "grad_norm": 1.5595564499799097, | |
| "learning_rate": 8.263518223330696e-07, | |
| "loss": 0.2716, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 239.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 710.4875, | |
| "eval_kl": 0.301416015625, | |
| "eval_loss": 0.1302235871553421, | |
| "eval_reward": 13.53874992132187, | |
| "eval_reward_std": 5.90446172952652, | |
| "eval_rewards/accuracy_reward_staging": 1.1926250100135802, | |
| "eval_rewards/format_reward": 0.825, | |
| "eval_rewards/format_reward_staging": 0.7875, | |
| "eval_runtime": 172.1092, | |
| "eval_samples_per_second": 0.116, | |
| "eval_steps_per_second": 0.029, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 820.21875, | |
| "epoch": 240.8, | |
| "grad_norm": 1.3594222983218733, | |
| "kl": 0.29791259765625, | |
| "learning_rate": 8.177644745078525e-07, | |
| "loss": 0.1216, | |
| "reward": 13.201562486588955, | |
| "reward_std": 6.912581101059914, | |
| "rewards/accuracy_reward_staging": 1.1646875077858567, | |
| "rewards/format_reward": 0.7734375, | |
| "rewards/format_reward_staging": 0.78125, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 714.53125, | |
| "epoch": 241.8, | |
| "grad_norm": 1.3736471708376299, | |
| "kl": 0.31427001953125, | |
| "learning_rate": 8.091910046234551e-07, | |
| "loss": 0.1539, | |
| "reward": 14.295312345027924, | |
| "reward_std": 5.410611517727375, | |
| "rewards/accuracy_reward_staging": 1.2639062507078052, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.8125, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 731.6875, | |
| "epoch": 242.8, | |
| "grad_norm": 1.4519054784859347, | |
| "kl": 0.33892822265625, | |
| "learning_rate": 8.006320655828029e-07, | |
| "loss": 0.1532, | |
| "reward": 11.943750023841858, | |
| "reward_std": 7.561622552573681, | |
| "rewards/accuracy_reward_staging": 1.0662500150501728, | |
| "rewards/format_reward": 0.640625, | |
| "rewards/format_reward_staging": 0.640625, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 807.59375, | |
| "epoch": 243.8, | |
| "grad_norm": 1.4438718122124043, | |
| "kl": 0.340576171875, | |
| "learning_rate": 7.920883091822408e-07, | |
| "loss": 0.1606, | |
| "reward": 10.796874925494194, | |
| "reward_std": 7.172753885388374, | |
| "rewards/accuracy_reward_staging": 0.9265625104308128, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.71875, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 755.296875, | |
| "epoch": 244.8, | |
| "grad_norm": 1.5583845174659146, | |
| "kl": 0.3447265625, | |
| "learning_rate": 7.835603860618971e-07, | |
| "loss": 0.2097, | |
| "reward": 11.843750149011612, | |
| "reward_std": 7.400421276688576, | |
| "rewards/accuracy_reward_staging": 1.0296875108033419, | |
| "rewards/format_reward": 0.796875, | |
| "rewards/format_reward_staging": 0.75, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 759.03125, | |
| "epoch": 245.8, | |
| "grad_norm": 1.3528957695912824, | |
| "kl": 0.357666015625, | |
| "learning_rate": 7.750489456561351e-07, | |
| "loss": 0.1164, | |
| "reward": 11.779687464237213, | |
| "reward_std": 6.353302523493767, | |
| "rewards/accuracy_reward_staging": 1.0295312507078052, | |
| "rewards/format_reward": 0.765625, | |
| "rewards/format_reward_staging": 0.71875, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 848.734375, | |
| "epoch": 246.8, | |
| "grad_norm": 1.554460288339238, | |
| "kl": 0.3973388671875, | |
| "learning_rate": 7.665546361440949e-07, | |
| "loss": 0.1267, | |
| "reward": 12.23906247317791, | |
| "reward_std": 7.234898805618286, | |
| "rewards/accuracy_reward_staging": 1.0817187651991844, | |
| "rewards/format_reward": 0.703125, | |
| "rewards/format_reward_staging": 0.71875, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 822.703125, | |
| "epoch": 247.8, | |
| "grad_norm": 1.433844235366676, | |
| "kl": 0.3592529296875, | |
| "learning_rate": 7.580781044003324e-07, | |
| "loss": 0.0235, | |
| "reward": 14.143749997019768, | |
| "reward_std": 5.741221696138382, | |
| "rewards/accuracy_reward_staging": 1.2581250127404928, | |
| "rewards/format_reward": 0.765625, | |
| "rewards/format_reward_staging": 0.796875, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 903.109375, | |
| "epoch": 248.8, | |
| "grad_norm": 1.3566149279604043, | |
| "kl": 0.31982421875, | |
| "learning_rate": 7.496199959455583e-07, | |
| "loss": 0.2022, | |
| "reward": 11.91718764603138, | |
| "reward_std": 6.288423582911491, | |
| "rewards/accuracy_reward_staging": 1.0448437514714897, | |
| "rewards/format_reward": 0.71875, | |
| "rewards/format_reward_staging": 0.75, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 249.8, | |
| "grad_norm": 3.5268979600947743, | |
| "learning_rate": 7.411809548974791e-07, | |
| "loss": 0.2181, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 249.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 863.7875, | |
| "eval_kl": 0.37177734375, | |
| "eval_loss": 0.22468861937522888, | |
| "eval_reward": 12.728750014305115, | |
| "eval_reward_std": 6.379789352416992, | |
| "eval_rewards/accuracy_reward_staging": 1.1216249838471413, | |
| "eval_rewards/format_reward": 0.775, | |
| "eval_rewards/format_reward_staging": 0.7375, | |
| "eval_runtime": 243.2396, | |
| "eval_samples_per_second": 0.082, | |
| "eval_steps_per_second": 0.021, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 816.5703125, | |
| "epoch": 250.8, | |
| "grad_norm": 1.3710303728682416, | |
| "kl": 0.4068603515625, | |
| "learning_rate": 7.327616239217431e-07, | |
| "loss": 0.176, | |
| "reward": 12.385937452316284, | |
| "reward_std": 6.818997707217932, | |
| "rewards/accuracy_reward_staging": 1.0815625092945993, | |
| "rewards/format_reward": 0.796875, | |
| "rewards/format_reward_staging": 0.7734375, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 811.671875, | |
| "epoch": 251.8, | |
| "grad_norm": 1.361752204741976, | |
| "kl": 0.39990234375, | |
| "learning_rate": 7.243626441830009e-07, | |
| "loss": 0.1261, | |
| "reward": 11.423437476158142, | |
| "reward_std": 7.801801845431328, | |
| "rewards/accuracy_reward_staging": 0.9985937401652336, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.65625, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 796.375, | |
| "epoch": 252.8, | |
| "grad_norm": 1.4560468845981351, | |
| "kl": 0.3995361328125, | |
| "learning_rate": 7.159846552960773e-07, | |
| "loss": 0.1045, | |
| "reward": 12.606250077486038, | |
| "reward_std": 7.680657230317593, | |
| "rewards/accuracy_reward_staging": 1.107500022277236, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.71875, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 951.03125, | |
| "epoch": 253.8, | |
| "grad_norm": 1.411783783971026, | |
| "kl": 0.36572265625, | |
| "learning_rate": 7.076282952772633e-07, | |
| "loss": 0.2697, | |
| "reward": 10.959374994039536, | |
| "reward_std": 8.183534801006317, | |
| "rewards/accuracy_reward_staging": 0.9631250090897083, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.578125, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 828.109375, | |
| "epoch": 254.8, | |
| "grad_norm": 1.573986411740295, | |
| "kl": 0.385498046875, | |
| "learning_rate": 6.992942004957269e-07, | |
| "loss": 0.2294, | |
| "reward": 11.017187476158142, | |
| "reward_std": 8.27015207707882, | |
| "rewards/accuracy_reward_staging": 0.9517187681049109, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.71875, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 896.015625, | |
| "epoch": 255.8, | |
| "grad_norm": 1.5503448731327603, | |
| "kl": 0.4189453125, | |
| "learning_rate": 6.909830056250526e-07, | |
| "loss": 0.233, | |
| "reward": 12.285937294363976, | |
| "reward_std": 7.363780289888382, | |
| "rewards/accuracy_reward_staging": 1.081718772649765, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.71875, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 891.390625, | |
| "epoch": 256.8, | |
| "grad_norm": 1.5949891985949471, | |
| "kl": 0.38238525390625, | |
| "learning_rate": 6.82695343594908e-07, | |
| "loss": 0.2359, | |
| "reward": 13.309375122189522, | |
| "reward_std": 7.619817182421684, | |
| "rewards/accuracy_reward_staging": 1.1700000204145908, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.734375, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 892.046875, | |
| "epoch": 257.8, | |
| "grad_norm": 1.6964810618377213, | |
| "kl": 0.375, | |
| "learning_rate": 6.744318455428435e-07, | |
| "loss": 0.3474, | |
| "reward": 11.334375008940697, | |
| "reward_std": 7.634813725948334, | |
| "rewards/accuracy_reward_staging": 0.9834375139325857, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.6875, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 723.828125, | |
| "epoch": 258.8, | |
| "grad_norm": 5.6325638555717505, | |
| "kl": 0.4317626953125, | |
| "learning_rate": 6.661931407662291e-07, | |
| "loss": 0.1424, | |
| "reward": 12.10312506556511, | |
| "reward_std": 7.089647740125656, | |
| "rewards/accuracy_reward_staging": 1.0399999842047691, | |
| "rewards/format_reward": 0.890625, | |
| "rewards/format_reward_staging": 0.8125, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 259.8, | |
| "grad_norm": 1.4896714438939402, | |
| "learning_rate": 6.579798566743313e-07, | |
| "loss": 0.2349, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 259.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 904.775, | |
| "eval_kl": 0.505517578125, | |
| "eval_loss": 0.21720127761363983, | |
| "eval_reward": 11.496249973773956, | |
| "eval_reward_std": 7.962953209877014, | |
| "eval_rewards/accuracy_reward_staging": 1.0021250128746033, | |
| "eval_rewards/format_reward": 0.825, | |
| "eval_rewards/format_reward_staging": 0.65, | |
| "eval_runtime": 252.62, | |
| "eval_samples_per_second": 0.079, | |
| "eval_steps_per_second": 0.02, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 851.5546875, | |
| "epoch": 260.8, | |
| "grad_norm": 1.4904521266628086, | |
| "kl": 0.43194580078125, | |
| "learning_rate": 6.497926187405324e-07, | |
| "loss": 0.1751, | |
| "reward": 12.633593738079071, | |
| "reward_std": 7.350790940225124, | |
| "rewards/accuracy_reward_staging": 1.1102343881502748, | |
| "rewards/format_reward": 0.8046875, | |
| "rewards/format_reward_staging": 0.7265625, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 854.90625, | |
| "epoch": 261.8, | |
| "grad_norm": 1.329154678834471, | |
| "kl": 0.3721923828125, | |
| "learning_rate": 6.416320504546997e-07, | |
| "loss": 0.1341, | |
| "reward": 12.78281256556511, | |
| "reward_std": 6.5289479941129684, | |
| "rewards/accuracy_reward_staging": 1.1282812533900142, | |
| "rewards/format_reward": 0.765625, | |
| "rewards/format_reward_staging": 0.734375, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 775.671875, | |
| "epoch": 262.8, | |
| "grad_norm": 1.7527252884904974, | |
| "kl": 0.3619384765625, | |
| "learning_rate": 6.334987732757028e-07, | |
| "loss": 0.2659, | |
| "reward": 11.443749904632568, | |
| "reward_std": 6.490789204835892, | |
| "rewards/accuracy_reward_staging": 0.9881250048056245, | |
| "rewards/format_reward": 0.828125, | |
| "rewards/format_reward_staging": 0.734375, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 837.03125, | |
| "epoch": 263.8, | |
| "grad_norm": 1.5611285133893833, | |
| "kl": 0.426025390625, | |
| "learning_rate": 6.253934065840879e-07, | |
| "loss": 0.1938, | |
| "reward": 10.160937517881393, | |
| "reward_std": 6.217289835214615, | |
| "rewards/accuracy_reward_staging": 0.8582812617532909, | |
| "rewards/format_reward": 0.875, | |
| "rewards/format_reward_staging": 0.703125, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 982.140625, | |
| "epoch": 264.8, | |
| "grad_norm": 1.6308836635992412, | |
| "kl": 0.4444580078125, | |
| "learning_rate": 6.173165676349102e-07, | |
| "loss": 0.3391, | |
| "reward": 11.560937523841858, | |
| "reward_std": 8.43397456407547, | |
| "rewards/accuracy_reward_staging": 1.0092187635600567, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.6875, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 917.21875, | |
| "epoch": 265.8, | |
| "grad_norm": 8.538915516124892, | |
| "kl": 0.4678955078125, | |
| "learning_rate": 6.092688715107263e-07, | |
| "loss": 0.1789, | |
| "reward": 12.584374994039536, | |
| "reward_std": 5.365057937800884, | |
| "rewards/accuracy_reward_staging": 1.1021875068545341, | |
| "rewards/format_reward": 0.859375, | |
| "rewards/format_reward_staging": 0.703125, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 978.140625, | |
| "epoch": 266.8, | |
| "grad_norm": 11.685381653606097, | |
| "kl": 0.5882568359375, | |
| "learning_rate": 6.012509310747538e-07, | |
| "loss": 0.191, | |
| "reward": 10.77499994635582, | |
| "reward_std": 7.996917471289635, | |
| "rewards/accuracy_reward_staging": 0.9415625259280205, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.609375, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 925.015625, | |
| "epoch": 267.8, | |
| "grad_norm": 1.4792658565609935, | |
| "kl": 0.4212646484375, | |
| "learning_rate": 5.932633569241999e-07, | |
| "loss": 0.1863, | |
| "reward": 11.715624958276749, | |
| "reward_std": 6.7979661747813225, | |
| "rewards/accuracy_reward_staging": 1.0200000181794167, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.671875, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 845.140625, | |
| "epoch": 268.8, | |
| "grad_norm": 5.20673071015397, | |
| "kl": 0.4212646484375, | |
| "learning_rate": 5.853067573437611e-07, | |
| "loss": 0.3561, | |
| "reward": 13.192187458276749, | |
| "reward_std": 7.627192087471485, | |
| "rewards/accuracy_reward_staging": 1.164531260728836, | |
| "rewards/format_reward": 0.828125, | |
| "rewards/format_reward_staging": 0.71875, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 269.8, | |
| "grad_norm": 1.3473441966408635, | |
| "learning_rate": 5.773817382593007e-07, | |
| "loss": 0.2184, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 269.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 896.7625, | |
| "eval_kl": 0.446728515625, | |
| "eval_loss": 0.2557651400566101, | |
| "eval_reward": 11.58375017642975, | |
| "eval_reward_std": 7.534060525894165, | |
| "eval_rewards/accuracy_reward_staging": 1.0096250101923943, | |
| "eval_rewards/format_reward": 0.8, | |
| "eval_rewards/format_reward_staging": 0.6875, | |
| "eval_runtime": 257.488, | |
| "eval_samples_per_second": 0.078, | |
| "eval_steps_per_second": 0.019, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 945.8359375, | |
| "epoch": 270.8, | |
| "grad_norm": 1.8166241750746335, | |
| "kl": 0.4503173828125, | |
| "learning_rate": 5.694889031917046e-07, | |
| "loss": 0.2195, | |
| "reward": 10.942968711256981, | |
| "reward_std": 7.070375669747591, | |
| "rewards/accuracy_reward_staging": 0.9497656342573464, | |
| "rewards/format_reward": 0.7890625, | |
| "rewards/format_reward_staging": 0.65625, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 860.984375, | |
| "epoch": 271.8, | |
| "grad_norm": 1.595405733714329, | |
| "kl": 0.455810546875, | |
| "learning_rate": 5.616288532109224e-07, | |
| "loss": 0.2539, | |
| "reward": 11.723437696695328, | |
| "reward_std": 7.531994827091694, | |
| "rewards/accuracy_reward_staging": 1.0301562547683716, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.671875, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 892.765625, | |
| "epoch": 272.8, | |
| "grad_norm": 1.9592298387429952, | |
| "kl": 0.4637451171875, | |
| "learning_rate": 5.538021868901912e-07, | |
| "loss": 0.2683, | |
| "reward": 10.093750059604645, | |
| "reward_std": 6.664774626493454, | |
| "rewards/accuracy_reward_staging": 0.8687500087544322, | |
| "rewards/format_reward": 0.796875, | |
| "rewards/format_reward_staging": 0.609375, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 958.265625, | |
| "epoch": 273.8, | |
| "grad_norm": 98.566029551543, | |
| "kl": 1.8087158203125, | |
| "learning_rate": 5.460095002604532e-07, | |
| "loss": 0.3116, | |
| "reward": 9.573437586426735, | |
| "reward_std": 7.937933571636677, | |
| "rewards/accuracy_reward_staging": 0.8276562560349703, | |
| "rewards/format_reward": 0.71875, | |
| "rewards/format_reward_staging": 0.578125, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 915.875, | |
| "epoch": 274.8, | |
| "grad_norm": 1.54560131594488, | |
| "kl": 0.4423828125, | |
| "learning_rate": 5.382513867649663e-07, | |
| "loss": 0.2622, | |
| "reward": 10.931249961256981, | |
| "reward_std": 7.524611636996269, | |
| "rewards/accuracy_reward_staging": 0.9571875012479722, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.609375, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 856.765625, | |
| "epoch": 275.8, | |
| "grad_norm": 3.0239354302219548, | |
| "kl": 0.5811767578125, | |
| "learning_rate": 5.305284372141095e-07, | |
| "loss": 0.2451, | |
| "reward": 11.835937544703484, | |
| "reward_std": 6.662468932569027, | |
| "rewards/accuracy_reward_staging": 1.039843776728958, | |
| "rewards/format_reward": 0.734375, | |
| "rewards/format_reward_staging": 0.703125, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 932.6875, | |
| "epoch": 276.8, | |
| "grad_norm": 1.616305493128834, | |
| "kl": 0.538818359375, | |
| "learning_rate": 5.228412397403915e-07, | |
| "loss": 0.2794, | |
| "reward": 9.204687595367432, | |
| "reward_std": 8.746634840965271, | |
| "rewards/accuracy_reward_staging": 0.7923437561839819, | |
| "rewards/format_reward": 0.734375, | |
| "rewards/format_reward_staging": 0.546875, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 853.65625, | |
| "epoch": 277.8, | |
| "grad_norm": 1.5898137384166846, | |
| "kl": 0.4730224609375, | |
| "learning_rate": 5.15190379753663e-07, | |
| "loss": 0.2771, | |
| "reward": 9.235937505960464, | |
| "reward_std": 8.327273309230804, | |
| "rewards/accuracy_reward_staging": 0.786093763075769, | |
| "rewards/format_reward": 0.796875, | |
| "rewards/format_reward_staging": 0.578125, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 823.4375, | |
| "epoch": 278.8, | |
| "grad_norm": 2.44549487611301, | |
| "kl": 0.52734375, | |
| "learning_rate": 5.07576439896533e-07, | |
| "loss": 0.2175, | |
| "reward": 11.657812714576721, | |
| "reward_std": 8.151460975408554, | |
| "rewards/accuracy_reward_staging": 1.0251562464982271, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.65625, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 279.8, | |
| "grad_norm": 2.0646627929666184, | |
| "learning_rate": 5.000000000000002e-07, | |
| "loss": 0.357, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 279.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 810.45, | |
| "eval_kl": 0.4798828125, | |
| "eval_loss": 0.1928146332502365, | |
| "eval_reward": 12.850000190734864, | |
| "eval_reward_std": 7.15499917268753, | |
| "eval_rewards/accuracy_reward_staging": 1.1475000128149986, | |
| "eval_rewards/format_reward": 0.725, | |
| "eval_rewards/format_reward_staging": 0.65, | |
| "eval_runtime": 242.0902, | |
| "eval_samples_per_second": 0.083, | |
| "eval_steps_per_second": 0.021, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 885.203125, | |
| "epoch": 280.8, | |
| "grad_norm": 1.6342686921474878, | |
| "kl": 0.453857421875, | |
| "learning_rate": 4.924616370392961e-07, | |
| "loss": 0.2134, | |
| "reward": 11.285156175494194, | |
| "reward_std": 7.196100067347288, | |
| "rewards/accuracy_reward_staging": 0.9832031358964741, | |
| "rewards/format_reward": 0.8046875, | |
| "rewards/format_reward_staging": 0.6484375, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 827.796875, | |
| "epoch": 281.8, | |
| "grad_norm": 1.9777507622588804, | |
| "kl": 0.501708984375, | |
| "learning_rate": 4.849619250899458e-07, | |
| "loss": 0.2286, | |
| "reward": 9.923437535762787, | |
| "reward_std": 6.76015942543745, | |
| "rewards/accuracy_reward_staging": 0.8532812669873238, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.640625, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 760.53125, | |
| "epoch": 282.8, | |
| "grad_norm": 1.983747741222898, | |
| "kl": 0.5208740234375, | |
| "learning_rate": 4.775014352840512e-07, | |
| "loss": 0.2496, | |
| "reward": 10.876562595367432, | |
| "reward_std": 8.217148587107658, | |
| "rewards/accuracy_reward_staging": 0.9735937742516398, | |
| "rewards/format_reward": 0.609375, | |
| "rewards/format_reward_staging": 0.53125, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 800.21875, | |
| "epoch": 283.8, | |
| "grad_norm": 9.98641000305957, | |
| "kl": 0.650634765625, | |
| "learning_rate": 4.700807357667952e-07, | |
| "loss": 0.224, | |
| "reward": 10.59218743443489, | |
| "reward_std": 8.216856330633163, | |
| "rewards/accuracy_reward_staging": 0.910781248472631, | |
| "rewards/format_reward": 0.84375, | |
| "rewards/format_reward_staging": 0.640625, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 724.0625, | |
| "epoch": 284.8, | |
| "grad_norm": 3.1107143716405066, | |
| "kl": 0.58544921875, | |
| "learning_rate": 4.62700391653176e-07, | |
| "loss": 0.1371, | |
| "reward": 12.684374868869781, | |
| "reward_std": 5.954333983361721, | |
| "rewards/accuracy_reward_staging": 1.1121875066310167, | |
| "rewards/format_reward": 0.796875, | |
| "rewards/format_reward_staging": 0.765625, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 673.5, | |
| "epoch": 285.8, | |
| "grad_norm": 1.6272052880635073, | |
| "kl": 0.486572265625, | |
| "learning_rate": 4.5536096498497287e-07, | |
| "loss": 0.1643, | |
| "reward": 10.56093743443489, | |
| "reward_std": 8.084580287337303, | |
| "rewards/accuracy_reward_staging": 0.9310937505215406, | |
| "rewards/format_reward": 0.6875, | |
| "rewards/format_reward_staging": 0.5625, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 651.046875, | |
| "epoch": 286.8, | |
| "grad_norm": 44.36285030093844, | |
| "kl": 0.7974853515625, | |
| "learning_rate": 4.480630146879418e-07, | |
| "loss": 0.161, | |
| "reward": 11.729687303304672, | |
| "reward_std": 8.256633162498474, | |
| "rewards/accuracy_reward_staging": 1.02453126385808, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.703125, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 694.03125, | |
| "epoch": 287.8, | |
| "grad_norm": 1.805744925603759, | |
| "kl": 0.5167236328125, | |
| "learning_rate": 4.408070965292533e-07, | |
| "loss": 0.1719, | |
| "reward": 12.934375196695328, | |
| "reward_std": 8.110588558018208, | |
| "rewards/accuracy_reward_staging": 1.149687498807907, | |
| "rewards/format_reward": 0.734375, | |
| "rewards/format_reward_staging": 0.703125, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 806.671875, | |
| "epoch": 288.8, | |
| "grad_norm": 365.24516465790214, | |
| "kl": 15.93310546875, | |
| "learning_rate": 4.335937630751674e-07, | |
| "loss": 0.422, | |
| "reward": 10.040624976158142, | |
| "reward_std": 6.964074335992336, | |
| "rewards/accuracy_reward_staging": 0.8900000145658851, | |
| "rewards/format_reward": 0.609375, | |
| "rewards/format_reward_staging": 0.53125, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 289.8, | |
| "grad_norm": 2.018184843282375, | |
| "learning_rate": 4.2642356364895417e-07, | |
| "loss": 0.159, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 289.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 676.325, | |
| "eval_kl": 0.47177734375, | |
| "eval_loss": 0.08975062519311905, | |
| "eval_reward": 12.25, | |
| "eval_reward_std": 7.105983757972718, | |
| "eval_rewards/accuracy_reward_staging": 1.080000001192093, | |
| "eval_rewards/format_reward": 0.75, | |
| "eval_rewards/format_reward_staging": 0.7, | |
| "eval_runtime": 172.0863, | |
| "eval_samples_per_second": 0.116, | |
| "eval_steps_per_second": 0.029, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 756.625, | |
| "epoch": 290.8, | |
| "grad_norm": 4.304003408743737, | |
| "kl": 0.55126953125, | |
| "learning_rate": 4.192970442890602e-07, | |
| "loss": 0.1843, | |
| "reward": 10.37187498062849, | |
| "reward_std": 6.766278941184282, | |
| "rewards/accuracy_reward_staging": 0.9106250035110861, | |
| "rewards/format_reward": 0.671875, | |
| "rewards/format_reward_staging": 0.59375, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 722.5, | |
| "epoch": 291.8, | |
| "grad_norm": 1.4716878010716028, | |
| "kl": 0.4508056640625, | |
| "learning_rate": 4.1221474770752696e-07, | |
| "loss": 0.1393, | |
| "reward": 12.068749815225601, | |
| "reward_std": 6.005900785326958, | |
| "rewards/accuracy_reward_staging": 1.0553124994039536, | |
| "rewards/format_reward": 0.8125, | |
| "rewards/format_reward_staging": 0.703125, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 757.15625, | |
| "epoch": 292.8, | |
| "grad_norm": 1.6973289397039133, | |
| "kl": 0.4842529296875, | |
| "learning_rate": 4.0517721324865884e-07, | |
| "loss": 0.2115, | |
| "reward": 9.153125151991844, | |
| "reward_std": 7.310723379254341, | |
| "rewards/accuracy_reward_staging": 0.7809374900534749, | |
| "rewards/format_reward": 0.765625, | |
| "rewards/format_reward_staging": 0.578125, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 700.765625, | |
| "epoch": 293.8, | |
| "grad_norm": 2.1555090676393096, | |
| "kl": 0.51806640625, | |
| "learning_rate": 3.981849768479516e-07, | |
| "loss": 0.2632, | |
| "reward": 11.881249964237213, | |
| "reward_std": 7.455913960933685, | |
| "rewards/accuracy_reward_staging": 1.0396875217556953, | |
| "rewards/format_reward": 0.78125, | |
| "rewards/format_reward_staging": 0.703125, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 643.59375, | |
| "epoch": 294.8, | |
| "grad_norm": 1.6704306436530678, | |
| "kl": 0.459716796875, | |
| "learning_rate": 3.912385709912793e-07, | |
| "loss": 0.1352, | |
| "reward": 11.445312559604645, | |
| "reward_std": 7.4289940893650055, | |
| "rewards/accuracy_reward_staging": 1.008593775331974, | |
| "rewards/format_reward": 0.703125, | |
| "rewards/format_reward_staging": 0.65625, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 684.578125, | |
| "epoch": 295.8, | |
| "grad_norm": 1.7855400540888517, | |
| "kl": 0.528076171875, | |
| "learning_rate": 3.843385246743417e-07, | |
| "loss": 0.1663, | |
| "reward": 9.97968752682209, | |
| "reward_std": 8.184111461043358, | |
| "rewards/accuracy_reward_staging": 0.8635937534272671, | |
| "rewards/format_reward": 0.734375, | |
| "rewards/format_reward_staging": 0.609375, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 602.75, | |
| "epoch": 296.8, | |
| "grad_norm": 38.11905446217515, | |
| "kl": 1.2889404296875, | |
| "learning_rate": 3.774853633623806e-07, | |
| "loss": 0.2038, | |
| "reward": 11.821874991059303, | |
| "reward_std": 8.198160663247108, | |
| "rewards/accuracy_reward_staging": 1.0400000140070915, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.671875, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 585.765625, | |
| "epoch": 297.8, | |
| "grad_norm": 1.8197669464076098, | |
| "kl": 0.5150146484375, | |
| "learning_rate": 3.706796089501627e-07, | |
| "loss": 0.0749, | |
| "reward": 11.434375017881393, | |
| "reward_std": 7.715316243469715, | |
| "rewards/accuracy_reward_staging": 0.9934375211596489, | |
| "rewards/format_reward": 0.796875, | |
| "rewards/format_reward_staging": 0.703125, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 613.03125, | |
| "epoch": 298.8, | |
| "grad_norm": 1.889354762217911, | |
| "kl": 0.4669189453125, | |
| "learning_rate": 3.639217797222359e-07, | |
| "loss": 0.1515, | |
| "reward": 11.481249898672104, | |
| "reward_std": 7.798143312335014, | |
| "rewards/accuracy_reward_staging": 1.0090625192970037, | |
| "rewards/format_reward": 0.71875, | |
| "rewards/format_reward_staging": 0.671875, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 299.8, | |
| "grad_norm": 2.6622348023095666, | |
| "learning_rate": 3.5721239031346063e-07, | |
| "loss": 0.2454, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 299.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 590.3125, | |
| "eval_kl": 0.5787109375, | |
| "eval_loss": 0.10901384055614471, | |
| "eval_reward": 11.113749873638152, | |
| "eval_reward_std": 8.458426451683044, | |
| "eval_rewards/accuracy_reward_staging": 0.9851250126957893, | |
| "eval_rewards/format_reward": 0.65, | |
| "eval_rewards/format_reward_staging": 0.6125, | |
| "eval_runtime": 141.3322, | |
| "eval_samples_per_second": 0.142, | |
| "eval_steps_per_second": 0.035, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 614.75, | |
| "epoch": 300.8, | |
| "grad_norm": 1.5247800909935867, | |
| "kl": 0.50823974609375, | |
| "learning_rate": 3.5055195166981646e-07, | |
| "loss": 0.0888, | |
| "reward": 11.17812480777502, | |
| "reward_std": 7.858443755656481, | |
| "rewards/accuracy_reward_staging": 0.9834375130012631, | |
| "rewards/format_reward": 0.7421875, | |
| "rewards/format_reward_staging": 0.6015625, | |
| "step": 301 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 590.609375, | |
| "epoch": 301.8, | |
| "grad_norm": 11.67646802939455, | |
| "kl": 0.736083984375, | |
| "learning_rate": 3.4394097100949283e-07, | |
| "loss": 0.119, | |
| "reward": 11.040624797344208, | |
| "reward_std": 6.185135334730148, | |
| "rewards/accuracy_reward_staging": 0.9775000158697367, | |
| "rewards/format_reward": 0.65625, | |
| "rewards/format_reward_staging": 0.609375, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 592.828125, | |
| "epoch": 302.8, | |
| "grad_norm": 2.07620138910771, | |
| "kl": 0.5858154296875, | |
| "learning_rate": 3.373799517842627e-07, | |
| "loss": 0.1362, | |
| "reward": 9.728124901652336, | |
| "reward_std": 6.484044134616852, | |
| "rewards/accuracy_reward_staging": 0.8431250131689012, | |
| "rewards/format_reward": 0.671875, | |
| "rewards/format_reward_staging": 0.625, | |
| "step": 303 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 590.421875, | |
| "epoch": 303.8, | |
| "grad_norm": 1.80968722219858, | |
| "kl": 0.577392578125, | |
| "learning_rate": 3.308693936411421e-07, | |
| "loss": 0.1018, | |
| "reward": 9.63281275331974, | |
| "reward_std": 6.755122885107994, | |
| "rewards/accuracy_reward_staging": 0.827343761920929, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.609375, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 568.5625, | |
| "epoch": 304.8, | |
| "grad_norm": 2.0406619631130307, | |
| "kl": 0.5341796875, | |
| "learning_rate": 3.2440979238433974e-07, | |
| "loss": 0.1264, | |
| "reward": 10.746875032782555, | |
| "reward_std": 7.6421735137701035, | |
| "rewards/accuracy_reward_staging": 0.9465625076554716, | |
| "rewards/format_reward": 0.671875, | |
| "rewards/format_reward_staging": 0.609375, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 624.703125, | |
| "epoch": 305.8, | |
| "grad_norm": 2524.9442790112334, | |
| "kl": 23.13818359375, | |
| "learning_rate": 3.180016399375016e-07, | |
| "loss": 0.9704, | |
| "reward": 10.040624886751175, | |
| "reward_std": 7.3795405626297, | |
| "rewards/accuracy_reward_staging": 0.8853125032037497, | |
| "rewards/format_reward": 0.640625, | |
| "rewards/format_reward_staging": 0.546875, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 599.765625, | |
| "epoch": 306.8, | |
| "grad_norm": 22.332538208315107, | |
| "kl": 0.8387451171875, | |
| "learning_rate": 3.1164542430624586e-07, | |
| "loss": 0.1017, | |
| "reward": 11.599999964237213, | |
| "reward_std": 6.432632386684418, | |
| "rewards/accuracy_reward_staging": 1.035000003874302, | |
| "rewards/format_reward": 0.625, | |
| "rewards/format_reward_staging": 0.625, | |
| "step": 307 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 612.078125, | |
| "epoch": 307.8, | |
| "grad_norm": 103.35305685554316, | |
| "kl": 0.972900390625, | |
| "learning_rate": 3.0534162954100263e-07, | |
| "loss": 0.1358, | |
| "reward": 9.193749964237213, | |
| "reward_std": 7.3355728685855865, | |
| "rewards/accuracy_reward_staging": 0.7959375064820051, | |
| "rewards/format_reward": 0.65625, | |
| "rewards/format_reward_staging": 0.578125, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 570.96875, | |
| "epoch": 308.8, | |
| "grad_norm": 50.34680954325702, | |
| "kl": 1.0068359375, | |
| "learning_rate": 2.990907357001491e-07, | |
| "loss": 0.0898, | |
| "reward": 9.468749985098839, | |
| "reward_std": 7.638161733746529, | |
| "rewards/accuracy_reward_staging": 0.8328124992549419, | |
| "rewards/format_reward": 0.59375, | |
| "rewards/format_reward_staging": 0.546875, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 309.8, | |
| "grad_norm": 2.404150826295122, | |
| "learning_rate": 2.9289321881345254e-07, | |
| "loss": 0.125, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 309.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 544.0375, | |
| "eval_kl": 0.5705078125, | |
| "eval_loss": 0.10134752094745636, | |
| "eval_reward": 11.172499823570252, | |
| "eval_reward_std": 7.30764594078064, | |
| "eval_rewards/accuracy_reward_staging": 0.9935000017285347, | |
| "eval_rewards/format_reward": 0.625, | |
| "eval_rewards/format_reward_staging": 0.6125, | |
| "eval_runtime": 125.2508, | |
| "eval_samples_per_second": 0.16, | |
| "eval_steps_per_second": 0.04, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 594.6796875, | |
| "epoch": 310.8, | |
| "grad_norm": 4.496947390349957, | |
| "kl": 0.5538330078125, | |
| "learning_rate": 2.867495508458185e-07, | |
| "loss": 0.0803, | |
| "reward": 11.534374952316284, | |
| "reward_std": 6.893470458686352, | |
| "rewards/accuracy_reward_staging": 1.0221875067800283, | |
| "rewards/format_reward": 0.671875, | |
| "rewards/format_reward_staging": 0.640625, | |
| "step": 311 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 604.53125, | |
| "epoch": 311.8, | |
| "grad_norm": 1.8183482676019072, | |
| "kl": 0.561279296875, | |
| "learning_rate": 2.8066019966134904e-07, | |
| "loss": 0.1272, | |
| "reward": 10.914062589406967, | |
| "reward_std": 7.2590411230921745, | |
| "rewards/accuracy_reward_staging": 0.9617187697440386, | |
| "rewards/format_reward": 0.65625, | |
| "rewards/format_reward_staging": 0.640625, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 588.84375, | |
| "epoch": 312.8, | |
| "grad_norm": 2.2241952673115684, | |
| "kl": 0.574951171875, | |
| "learning_rate": 2.7462562898771256e-07, | |
| "loss": 0.0993, | |
| "reward": 11.553124994039536, | |
| "reward_std": 8.42781974375248, | |
| "rewards/accuracy_reward_staging": 1.0303124962374568, | |
| "rewards/format_reward": 0.671875, | |
| "rewards/format_reward_staging": 0.578125, | |
| "step": 313 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 551.140625, | |
| "epoch": 313.8, | |
| "grad_norm": 10.414899131526484, | |
| "kl": 0.54833984375, | |
| "learning_rate": 2.6864629838082954e-07, | |
| "loss": 0.0799, | |
| "reward": 11.815624713897705, | |
| "reward_std": 6.541705533862114, | |
| "rewards/accuracy_reward_staging": 1.0659374967217445, | |
| "rewards/format_reward": 0.59375, | |
| "rewards/format_reward_staging": 0.5625, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 553.046875, | |
| "epoch": 314.8, | |
| "grad_norm": 47.0803214727609, | |
| "kl": 0.74169921875, | |
| "learning_rate": 2.62722663189876e-07, | |
| "loss": 0.082, | |
| "reward": 8.54062494635582, | |
| "reward_std": 8.188691228628159, | |
| "rewards/accuracy_reward_staging": 0.7603125041350722, | |
| "rewards/format_reward": 0.453125, | |
| "rewards/format_reward_staging": 0.484375, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 568.03125, | |
| "epoch": 315.8, | |
| "grad_norm": 1.697069435904432, | |
| "kl": 0.52001953125, | |
| "learning_rate": 2.568551745226056e-07, | |
| "loss": 0.0445, | |
| "reward": 11.301562577486038, | |
| "reward_std": 7.986581727862358, | |
| "rewards/accuracy_reward_staging": 0.9989062640815973, | |
| "rewards/format_reward": 0.65625, | |
| "rewards/format_reward_staging": 0.65625, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 604.609375, | |
| "epoch": 316.8, | |
| "grad_norm": 1.9224538433799612, | |
| "kl": 0.5042724609375, | |
| "learning_rate": 2.510442792109978e-07, | |
| "loss": 0.1747, | |
| "reward": 11.126562386751175, | |
| "reward_std": 8.233518898487091, | |
| "rewards/accuracy_reward_staging": 0.9829687615856528, | |
| "rewards/format_reward": 0.703125, | |
| "rewards/format_reward_staging": 0.59375, | |
| "step": 317 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 540.96875, | |
| "epoch": 317.8, | |
| "grad_norm": 1.8852768343458246, | |
| "kl": 0.50634765625, | |
| "learning_rate": 2.45290419777228e-07, | |
| "loss": 0.0711, | |
| "reward": 10.67187511920929, | |
| "reward_std": 7.687072329223156, | |
| "rewards/accuracy_reward_staging": 0.9312500208616257, | |
| "rewards/format_reward": 0.6875, | |
| "rewards/format_reward_staging": 0.671875, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 586.21875, | |
| "epoch": 318.8, | |
| "grad_norm": 1.9063098360507449, | |
| "kl": 0.60400390625, | |
| "learning_rate": 2.395940343999691e-07, | |
| "loss": 0.072, | |
| "reward": 9.948437467217445, | |
| "reward_std": 8.150914326310158, | |
| "rewards/accuracy_reward_staging": 0.8745312532410026, | |
| "rewards/format_reward": 0.65625, | |
| "rewards/format_reward_staging": 0.546875, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 319.8, | |
| "grad_norm": 2.0586826121884196, | |
| "learning_rate": 2.339555568810221e-07, | |
| "loss": 0.0995, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 319.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 624.5875, | |
| "eval_kl": 1.31884765625, | |
| "eval_loss": 0.14124900102615356, | |
| "eval_reward": 10.472500038146972, | |
| "eval_reward_std": 8.316550683975219, | |
| "eval_rewards/accuracy_reward_staging": 0.9397500105202198, | |
| "eval_rewards/format_reward": 0.55, | |
| "eval_rewards/format_reward_staging": 0.525, | |
| "eval_runtime": 166.2792, | |
| "eval_samples_per_second": 0.12, | |
| "eval_steps_per_second": 0.03, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 531.609375, | |
| "epoch": 320.8, | |
| "grad_norm": 3.9996163379427734, | |
| "kl": 0.682861328125, | |
| "learning_rate": 2.283754166122802e-07, | |
| "loss": 0.1015, | |
| "reward": 11.93515631556511, | |
| "reward_std": 7.991424214094877, | |
| "rewards/accuracy_reward_staging": 1.0638281423598528, | |
| "rewards/format_reward": 0.65625, | |
| "rewards/format_reward_staging": 0.640625, | |
| "step": 321 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 536.390625, | |
| "epoch": 321.8, | |
| "grad_norm": 3.0819342365087095, | |
| "kl": 0.537841796875, | |
| "learning_rate": 2.228540385430291e-07, | |
| "loss": 0.0322, | |
| "reward": 10.235937386751175, | |
| "reward_std": 7.312522612512112, | |
| "rewards/accuracy_reward_staging": 0.9110937523655593, | |
| "rewards/format_reward": 0.5625, | |
| "rewards/format_reward_staging": 0.5625, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 564.328125, | |
| "epoch": 322.8, | |
| "grad_norm": 2.26675304418193, | |
| "kl": 0.61669921875, | |
| "learning_rate": 2.1739184314758607e-07, | |
| "loss": 0.1112, | |
| "reward": 9.662499949336052, | |
| "reward_std": 7.8978844210505486, | |
| "rewards/accuracy_reward_staging": 0.8537500146776438, | |
| "rewards/format_reward": 0.578125, | |
| "rewards/format_reward_staging": 0.546875, | |
| "step": 323 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 561.875, | |
| "epoch": 323.8, | |
| "grad_norm": 1.9954959778735344, | |
| "kl": 0.582763671875, | |
| "learning_rate": 2.1198924639327808e-07, | |
| "loss": 0.1118, | |
| "reward": 9.614062517881393, | |
| "reward_std": 8.075859874486923, | |
| "rewards/accuracy_reward_staging": 0.848906246945262, | |
| "rewards/format_reward": 0.578125, | |
| "rewards/format_reward_staging": 0.546875, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 513.0625, | |
| "epoch": 324.8, | |
| "grad_norm": 1.9291434012665167, | |
| "kl": 0.508056640625, | |
| "learning_rate": 2.0664665970876495e-07, | |
| "loss": 0.09, | |
| "reward": 11.331250041723251, | |
| "reward_std": 7.7114517986774445, | |
| "rewards/accuracy_reward_staging": 0.9878125172108412, | |
| "rewards/format_reward": 0.765625, | |
| "rewards/format_reward_staging": 0.6875, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 531.9375, | |
| "epoch": 325.8, | |
| "grad_norm": 2.130045177921353, | |
| "kl": 0.6065673828125, | |
| "learning_rate": 2.0136448995270738e-07, | |
| "loss": 0.1138, | |
| "reward": 9.910937532782555, | |
| "reward_std": 7.951791629195213, | |
| "rewards/accuracy_reward_staging": 0.8645312692970037, | |
| "rewards/format_reward": 0.65625, | |
| "rewards/format_reward_staging": 0.609375, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 590.3125, | |
| "epoch": 326.8, | |
| "grad_norm": 14.919466822673119, | |
| "kl": 0.8441162109375, | |
| "learning_rate": 1.961431393827827e-07, | |
| "loss": 0.1238, | |
| "reward": 9.217187523841858, | |
| "reward_std": 7.790774069726467, | |
| "rewards/accuracy_reward_staging": 0.7967187650501728, | |
| "rewards/format_reward": 0.65625, | |
| "rewards/format_reward_staging": 0.59375, | |
| "step": 327 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 581.578125, | |
| "epoch": 327.8, | |
| "grad_norm": 2.201816976112108, | |
| "kl": 0.5943603515625, | |
| "learning_rate": 1.9098300562505264e-07, | |
| "loss": 0.1018, | |
| "reward": 10.471874967217445, | |
| "reward_std": 7.630053393542767, | |
| "rewards/accuracy_reward_staging": 0.9268750208429992, | |
| "rewards/format_reward": 0.640625, | |
| "rewards/format_reward_staging": 0.5625, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 556.71875, | |
| "epoch": 328.8, | |
| "grad_norm": 5.906945093072998, | |
| "kl": 0.6585693359375, | |
| "learning_rate": 1.8588448164368087e-07, | |
| "loss": 0.0954, | |
| "reward": 10.989062532782555, | |
| "reward_std": 7.685741938650608, | |
| "rewards/accuracy_reward_staging": 0.972343759611249, | |
| "rewards/format_reward": 0.671875, | |
| "rewards/format_reward_staging": 0.59375, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 329.8, | |
| "grad_norm": 2.0853287849886426, | |
| "learning_rate": 1.8084795571100809e-07, | |
| "loss": 0.0689, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 329.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 538.2375, | |
| "eval_kl": 0.81767578125, | |
| "eval_loss": 0.1242348700761795, | |
| "eval_reward": 11.619999825954437, | |
| "eval_reward_std": 8.338389962911606, | |
| "eval_rewards/accuracy_reward_staging": 1.0357500161975621, | |
| "eval_rewards/format_reward": 0.6375, | |
| "eval_rewards/format_reward_staging": 0.625, | |
| "eval_runtime": 130.5157, | |
| "eval_samples_per_second": 0.153, | |
| "eval_steps_per_second": 0.038, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 552.21875, | |
| "epoch": 330.8, | |
| "grad_norm": 1.9411099886639303, | |
| "kl": 0.55401611328125, | |
| "learning_rate": 1.758738113779843e-07, | |
| "loss": 0.1164, | |
| "reward": 12.410937391221523, | |
| "reward_std": 6.827382728457451, | |
| "rewards/accuracy_reward_staging": 1.1129687398206443, | |
| "rewards/format_reward": 0.65625, | |
| "rewards/format_reward_staging": 0.625, | |
| "step": 331 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 550.75, | |
| "epoch": 331.8, | |
| "grad_norm": 2.1891389339166056, | |
| "kl": 0.5709228515625, | |
| "learning_rate": 1.7096242744495838e-07, | |
| "loss": -0.0025, | |
| "reward": 11.460937261581421, | |
| "reward_std": 6.8272934183478355, | |
| "rewards/accuracy_reward_staging": 1.0195312476716936, | |
| "rewards/format_reward": 0.671875, | |
| "rewards/format_reward_staging": 0.59375, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 524.5, | |
| "epoch": 332.8, | |
| "grad_norm": 2.0777537885395376, | |
| "kl": 0.536376953125, | |
| "learning_rate": 1.661141779328319e-07, | |
| "loss": 0.0535, | |
| "reward": 12.75, | |
| "reward_std": 7.35980150103569, | |
| "rewards/accuracy_reward_staging": 1.1328125018626451, | |
| "rewards/format_reward": 0.71875, | |
| "rewards/format_reward_staging": 0.703125, | |
| "step": 333 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 499.59375, | |
| "epoch": 333.8, | |
| "grad_norm": 4.1435358861337095, | |
| "kl": 0.5528564453125, | |
| "learning_rate": 1.6132943205457606e-07, | |
| "loss": 0.0848, | |
| "reward": 10.412499874830246, | |
| "reward_std": 8.24751353263855, | |
| "rewards/accuracy_reward_staging": 0.9053125167265534, | |
| "rewards/format_reward": 0.71875, | |
| "rewards/format_reward_staging": 0.640625, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 528.609375, | |
| "epoch": 334.8, | |
| "grad_norm": 2.837368202419978, | |
| "kl": 0.573486328125, | |
| "learning_rate": 1.566085541871145e-07, | |
| "loss": 0.0901, | |
| "reward": 13.03749993443489, | |
| "reward_std": 7.35784338414669, | |
| "rewards/accuracy_reward_staging": 1.1756250127218664, | |
| "rewards/format_reward": 0.640625, | |
| "rewards/format_reward_staging": 0.640625, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 536.09375, | |
| "epoch": 335.8, | |
| "grad_norm": 1.9595695246588565, | |
| "kl": 0.5914306640625, | |
| "learning_rate": 1.5195190384357404e-07, | |
| "loss": 0.0832, | |
| "reward": 9.493749916553497, | |
| "reward_std": 6.751585811376572, | |
| "rewards/accuracy_reward_staging": 0.8478124998509884, | |
| "rewards/format_reward": 0.546875, | |
| "rewards/format_reward_staging": 0.46875, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 519.390625, | |
| "epoch": 336.8, | |
| "grad_norm": 2.110347695686011, | |
| "kl": 0.519287109375, | |
| "learning_rate": 1.473598356459078e-07, | |
| "loss": 0.0886, | |
| "reward": 12.745312631130219, | |
| "reward_std": 7.440838478505611, | |
| "rewards/accuracy_reward_staging": 1.122968764975667, | |
| "rewards/format_reward": 0.796875, | |
| "rewards/format_reward_staging": 0.71875, | |
| "step": 337 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 547.65625, | |
| "epoch": 337.8, | |
| "grad_norm": 1.727304431939535, | |
| "kl": 0.4608154296875, | |
| "learning_rate": 1.4283269929788776e-07, | |
| "loss": 0.0791, | |
| "reward": 12.464062497019768, | |
| "reward_std": 6.970313638448715, | |
| "rewards/accuracy_reward_staging": 1.1073437612503767, | |
| "rewards/format_reward": 0.703125, | |
| "rewards/format_reward_staging": 0.6875, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 516.578125, | |
| "epoch": 338.8, | |
| "grad_norm": 45.73313544647479, | |
| "kl": 0.8197021484375, | |
| "learning_rate": 1.3837083955847417e-07, | |
| "loss": 0.102, | |
| "reward": 11.170312657952309, | |
| "reward_std": 7.535245016217232, | |
| "rewards/accuracy_reward_staging": 0.9779687668196857, | |
| "rewards/format_reward": 0.71875, | |
| "rewards/format_reward_staging": 0.671875, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 339.8, | |
| "grad_norm": 2.6477795175870518, | |
| "learning_rate": 1.3397459621556128e-07, | |
| "loss": 0.1162, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 339.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 529.1375, | |
| "eval_kl": 23.68701171875, | |
| "eval_loss": 1.2650219202041626, | |
| "eval_reward": 10.880000054836273, | |
| "eval_reward_std": 7.727067697048187, | |
| "eval_rewards/accuracy_reward_staging": 0.9605000212788581, | |
| "eval_rewards/format_reward": 0.65, | |
| "eval_rewards/format_reward_staging": 0.625, | |
| "eval_runtime": 120.192, | |
| "eval_samples_per_second": 0.166, | |
| "eval_steps_per_second": 0.042, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 560.4375, | |
| "epoch": 340.8, | |
| "grad_norm": 2.062257482287411, | |
| "kl": 0.56280517578125, | |
| "learning_rate": 1.296443040601003e-07, | |
| "loss": 0.0435, | |
| "reward": 10.376562610268593, | |
| "reward_std": 8.002589859068394, | |
| "rewards/accuracy_reward_staging": 0.9189062397927046, | |
| "rewards/format_reward": 0.6171875, | |
| "rewards/format_reward_staging": 0.5703125, | |
| "step": 341 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 539.75, | |
| "epoch": 341.8, | |
| "grad_norm": 2.2745767425023513, | |
| "kl": 0.53656005859375, | |
| "learning_rate": 1.2538029286060424e-07, | |
| "loss": 0.0623, | |
| "reward": 11.88906241953373, | |
| "reward_std": 8.809318155050278, | |
| "rewards/accuracy_reward_staging": 1.0607812739908695, | |
| "rewards/format_reward": 0.671875, | |
| "rewards/format_reward_staging": 0.609375, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 537.71875, | |
| "epoch": 342.8, | |
| "grad_norm": 1.9073471200149112, | |
| "kl": 0.48541259765625, | |
| "learning_rate": 1.2118288733803472e-07, | |
| "loss": 0.1154, | |
| "reward": 14.631249979138374, | |
| "reward_std": 6.283873476088047, | |
| "rewards/accuracy_reward_staging": 1.3256250023841858, | |
| "rewards/format_reward": 0.703125, | |
| "rewards/format_reward_staging": 0.671875, | |
| "step": 343 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 548.6875, | |
| "epoch": 343.8, | |
| "grad_norm": 2.2191165734186025, | |
| "kl": 0.6094970703125, | |
| "learning_rate": 1.1705240714107301e-07, | |
| "loss": 0.1021, | |
| "reward": 9.640625074505806, | |
| "reward_std": 8.212526381015778, | |
| "rewards/accuracy_reward_staging": 0.8593750139698386, | |
| "rewards/format_reward": 0.53125, | |
| "rewards/format_reward_staging": 0.515625, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 513.8125, | |
| "epoch": 344.8, | |
| "grad_norm": 1.9854032388069334, | |
| "kl": 0.523681640625, | |
| "learning_rate": 1.1298916682177828e-07, | |
| "loss": 0.0055, | |
| "reward": 14.028124868869781, | |
| "reward_std": 6.586119674146175, | |
| "rewards/accuracy_reward_staging": 1.2528125252574682, | |
| "rewards/format_reward": 0.765625, | |
| "rewards/format_reward_staging": 0.734375, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 538.0, | |
| "epoch": 345.8, | |
| "grad_norm": 1.922770111948013, | |
| "kl": 0.4796142578125, | |
| "learning_rate": 1.089934758116322e-07, | |
| "loss": 0.0353, | |
| "reward": 13.531249672174454, | |
| "reward_std": 7.330604811664671, | |
| "rewards/accuracy_reward_staging": 1.2109374832361937, | |
| "rewards/format_reward": 0.71875, | |
| "rewards/format_reward_staging": 0.703125, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 556.25, | |
| "epoch": 346.8, | |
| "grad_norm": 1.9617536727014244, | |
| "kl": 0.6065673828125, | |
| "learning_rate": 1.05065638397975e-07, | |
| "loss": 0.0607, | |
| "reward": 10.37343730032444, | |
| "reward_std": 7.6566493809223175, | |
| "rewards/accuracy_reward_staging": 0.9264062475413084, | |
| "rewards/format_reward": 0.5625, | |
| "rewards/format_reward_staging": 0.546875, | |
| "step": 347 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 514.421875, | |
| "epoch": 347.8, | |
| "grad_norm": 5.910623715329322, | |
| "kl": 0.586669921875, | |
| "learning_rate": 1.0120595370083318e-07, | |
| "loss": 0.0441, | |
| "reward": 10.33906227350235, | |
| "reward_std": 5.6453575268387794, | |
| "rewards/accuracy_reward_staging": 0.9151562377810478, | |
| "rewards/format_reward": 0.578125, | |
| "rewards/format_reward_staging": 0.609375, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 505.5625, | |
| "epoch": 348.8, | |
| "grad_norm": 2.2977218203304983, | |
| "kl": 0.5316162109375, | |
| "learning_rate": 9.741471565013958e-08, | |
| "loss": 0.0605, | |
| "reward": 12.892187714576721, | |
| "reward_std": 7.930499374866486, | |
| "rewards/accuracy_reward_staging": 1.1439062617719173, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.703125, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 349.8, | |
| "grad_norm": 1.8861216023880358, | |
| "learning_rate": 9.369221296335006e-08, | |
| "loss": 0.1235, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 349.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 520.25, | |
| "eval_kl": 0.54873046875, | |
| "eval_loss": 0.06522668898105621, | |
| "eval_reward": 12.468749737739563, | |
| "eval_reward_std": 7.662279307842255, | |
| "eval_rewards/accuracy_reward_staging": 1.1093750052154063, | |
| "eval_rewards/format_reward": 0.725, | |
| "eval_rewards/format_reward_staging": 0.65, | |
| "eval_runtime": 127.6467, | |
| "eval_samples_per_second": 0.157, | |
| "eval_steps_per_second": 0.039, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 525.09375, | |
| "epoch": 350.8, | |
| "grad_norm": 2.5663009167562287, | |
| "kl": 0.54559326171875, | |
| "learning_rate": 9.003872912345689e-08, | |
| "loss": 0.0985, | |
| "reward": 10.989843875169754, | |
| "reward_std": 7.33668365329504, | |
| "rewards/accuracy_reward_staging": 0.9677343829534948, | |
| "rewards/format_reward": 0.671875, | |
| "rewards/format_reward_staging": 0.640625, | |
| "step": 351 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 566.671875, | |
| "epoch": 351.8, | |
| "grad_norm": 3.306328897391672, | |
| "kl": 0.5091552734375, | |
| "learning_rate": 8.645454235739902e-08, | |
| "loss": 0.11, | |
| "reward": 11.37343755364418, | |
| "reward_std": 7.7504191398620605, | |
| "rewards/accuracy_reward_staging": 1.009218767285347, | |
| "rewards/format_reward": 0.671875, | |
| "rewards/format_reward_staging": 0.609375, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 525.484375, | |
| "epoch": 352.8, | |
| "grad_norm": 1.979792175196284, | |
| "kl": 0.520751953125, | |
| "learning_rate": 8.293992561487595e-08, | |
| "loss": 0.034, | |
| "reward": 10.399999901652336, | |
| "reward_std": 6.3909139558672905, | |
| "rewards/accuracy_reward_staging": 0.9212500043213367, | |
| "rewards/format_reward": 0.625, | |
| "rewards/format_reward_staging": 0.5625, | |
| "step": 353 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 505.015625, | |
| "epoch": 353.8, | |
| "grad_norm": 94.2447409316946, | |
| "kl": 2.2459716796875, | |
| "learning_rate": 7.949514654755962e-08, | |
| "loss": 0.1011, | |
| "reward": 10.320312514901161, | |
| "reward_std": 6.939793795347214, | |
| "rewards/accuracy_reward_staging": 0.9054687642492354, | |
| "rewards/format_reward": 0.65625, | |
| "rewards/format_reward_staging": 0.609375, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 551.140625, | |
| "epoch": 354.8, | |
| "grad_norm": 2.0400310680209444, | |
| "kl": 0.5252685546875, | |
| "learning_rate": 7.612046748871326e-08, | |
| "loss": 0.0919, | |
| "reward": 10.507812529802322, | |
| "reward_std": 6.574328362941742, | |
| "rewards/accuracy_reward_staging": 0.9257812593132257, | |
| "rewards/format_reward": 0.625, | |
| "rewards/format_reward_staging": 0.625, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 509.78125, | |
| "epoch": 355.8, | |
| "grad_norm": 10.548554530431085, | |
| "kl": 0.63427734375, | |
| "learning_rate": 7.281614543321269e-08, | |
| "loss": 0.0484, | |
| "reward": 12.018749877810478, | |
| "reward_std": 7.941742122173309, | |
| "rewards/accuracy_reward_staging": 1.0721874982118607, | |
| "rewards/format_reward": 0.671875, | |
| "rewards/format_reward_staging": 0.625, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 496.859375, | |
| "epoch": 356.8, | |
| "grad_norm": 2.9275259721912263, | |
| "kl": 0.5072021484375, | |
| "learning_rate": 6.958243201797553e-08, | |
| "loss": 0.0034, | |
| "reward": 13.354687303304672, | |
| "reward_std": 6.157866388559341, | |
| "rewards/accuracy_reward_staging": 1.1901562474668026, | |
| "rewards/format_reward": 0.71875, | |
| "rewards/format_reward_staging": 0.734375, | |
| "step": 357 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 497.078125, | |
| "epoch": 357.8, | |
| "grad_norm": 3.259670897497323, | |
| "kl": 0.56689453125, | |
| "learning_rate": 6.641957350279837e-08, | |
| "loss": 0.1212, | |
| "reward": 10.450000002980232, | |
| "reward_std": 8.285482600331306, | |
| "rewards/accuracy_reward_staging": 0.9262500102631748, | |
| "rewards/format_reward": 0.59375, | |
| "rewards/format_reward_staging": 0.59375, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 586.21875, | |
| "epoch": 358.8, | |
| "grad_norm": 3.6157884851011115, | |
| "kl": 0.7269287109375, | |
| "learning_rate": 6.332781075160243e-08, | |
| "loss": 0.1011, | |
| "reward": 8.88906255364418, | |
| "reward_std": 8.714880511164665, | |
| "rewards/accuracy_reward_staging": 0.7857812475413084, | |
| "rewards/format_reward": 0.53125, | |
| "rewards/format_reward_staging": 0.5, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 359.8, | |
| "grad_norm": 3.272297448604345, | |
| "learning_rate": 6.030737921409168e-08, | |
| "loss": 0.0793, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 359.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 534.5875, | |
| "eval_kl": 0.5283203125, | |
| "eval_loss": 0.10396303236484528, | |
| "eval_reward": 10.23999993801117, | |
| "eval_reward_std": 7.775251030921936, | |
| "eval_rewards/accuracy_reward_staging": 0.9027499988675117, | |
| "eval_rewards/format_reward": 0.6375, | |
| "eval_rewards/format_reward_staging": 0.575, | |
| "eval_runtime": 125.3052, | |
| "eval_samples_per_second": 0.16, | |
| "eval_steps_per_second": 0.04, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 529.671875, | |
| "epoch": 360.8, | |
| "grad_norm": 2.443851018491839, | |
| "kl": 0.60870361328125, | |
| "learning_rate": 5.735850890782157e-08, | |
| "loss": 0.0695, | |
| "reward": 11.39765627682209, | |
| "reward_std": 7.690338987857103, | |
| "rewards/accuracy_reward_staging": 1.0139843788929284, | |
| "rewards/format_reward": 0.6328125, | |
| "rewards/format_reward_staging": 0.625, | |
| "step": 361 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 525.890625, | |
| "epoch": 361.8, | |
| "grad_norm": 4.571161669605146, | |
| "kl": 0.7027587890625, | |
| "learning_rate": 5.448142440068315e-08, | |
| "loss": 0.0926, | |
| "reward": 12.457812368869781, | |
| "reward_std": 8.532767742872238, | |
| "rewards/accuracy_reward_staging": 1.097343759611249, | |
| "rewards/format_reward": 0.765625, | |
| "rewards/format_reward_staging": 0.71875, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 540.40625, | |
| "epoch": 362.8, | |
| "grad_norm": 2.6127548669423333, | |
| "kl": 0.56591796875, | |
| "learning_rate": 5.1676344793800675e-08, | |
| "loss": 0.1056, | |
| "reward": 10.995312467217445, | |
| "reward_std": 7.668447345495224, | |
| "rewards/accuracy_reward_staging": 0.9682812597602606, | |
| "rewards/format_reward": 0.65625, | |
| "rewards/format_reward_staging": 0.65625, | |
| "step": 363 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 518.546875, | |
| "epoch": 363.8, | |
| "grad_norm": 2.0552105230802944, | |
| "kl": 0.52197265625, | |
| "learning_rate": 4.8943483704846465e-08, | |
| "loss": 0.1321, | |
| "reward": 11.596875041723251, | |
| "reward_std": 8.213591203093529, | |
| "rewards/accuracy_reward_staging": 1.0190625078976154, | |
| "rewards/format_reward": 0.71875, | |
| "rewards/format_reward_staging": 0.6875, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 505.765625, | |
| "epoch": 364.8, | |
| "grad_norm": 2.326851393730122, | |
| "kl": 0.5279541015625, | |
| "learning_rate": 4.6283049251773176e-08, | |
| "loss": 0.0553, | |
| "reward": 11.739062711596489, | |
| "reward_std": 7.672401025891304, | |
| "rewards/accuracy_reward_staging": 1.0348437502980232, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.640625, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 549.078125, | |
| "epoch": 365.8, | |
| "grad_norm": 5.111208968520272, | |
| "kl": 0.6072998046875, | |
| "learning_rate": 4.3695244036964564e-08, | |
| "loss": 0.082, | |
| "reward": 9.354687571525574, | |
| "reward_std": 7.511838540434837, | |
| "rewards/accuracy_reward_staging": 0.8167187636718154, | |
| "rewards/format_reward": 0.609375, | |
| "rewards/format_reward_staging": 0.578125, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 517.296875, | |
| "epoch": 366.8, | |
| "grad_norm": 2.581279648987323, | |
| "kl": 0.5887451171875, | |
| "learning_rate": 4.1180265131806946e-08, | |
| "loss": 0.0205, | |
| "reward": 10.934374988079071, | |
| "reward_std": 8.458094909787178, | |
| "rewards/accuracy_reward_staging": 0.9778125118464231, | |
| "rewards/format_reward": 0.59375, | |
| "rewards/format_reward_staging": 0.5625, | |
| "step": 367 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 518.40625, | |
| "epoch": 367.8, | |
| "grad_norm": 2.035652649324917, | |
| "kl": 0.569091796875, | |
| "learning_rate": 3.87383040616811e-08, | |
| "loss": 0.0615, | |
| "reward": 11.773437261581421, | |
| "reward_std": 7.848160028457642, | |
| "rewards/accuracy_reward_staging": 1.0507812481373549, | |
| "rewards/format_reward": 0.65625, | |
| "rewards/format_reward_staging": 0.609375, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 568.703125, | |
| "epoch": 368.8, | |
| "grad_norm": 2.8704472982245526, | |
| "kl": 0.57958984375, | |
| "learning_rate": 3.636954679137705e-08, | |
| "loss": 0.081, | |
| "reward": 9.676562517881393, | |
| "reward_std": 8.41864463686943, | |
| "rewards/accuracy_reward_staging": 0.8692187499254942, | |
| "rewards/format_reward": 0.515625, | |
| "rewards/format_reward_staging": 0.46875, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 369.8, | |
| "grad_norm": 8.688192162327972, | |
| "learning_rate": 3.4074173710931796e-08, | |
| "loss": 0.092, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 369.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 537.7875, | |
| "eval_kl": 0.5083984375, | |
| "eval_loss": 0.07736475765705109, | |
| "eval_reward": 11.122499930858613, | |
| "eval_reward_std": 6.3223115285858515, | |
| "eval_rewards/accuracy_reward_staging": 0.989750000461936, | |
| "eval_rewards/format_reward": 0.6375, | |
| "eval_rewards/format_reward_staging": 0.5875, | |
| "eval_runtime": 137.4717, | |
| "eval_samples_per_second": 0.145, | |
| "eval_steps_per_second": 0.036, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 548.3203125, | |
| "epoch": 370.8, | |
| "grad_norm": 2.9935915085575524, | |
| "kl": 0.7093505859375, | |
| "learning_rate": 3.185235962189237e-08, | |
| "loss": 0.086, | |
| "reward": 10.013281270861626, | |
| "reward_std": 7.9125730618834496, | |
| "rewards/accuracy_reward_staging": 0.892734372522682, | |
| "rewards/format_reward": 0.5625, | |
| "rewards/format_reward_staging": 0.5234375, | |
| "step": 371 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 507.0, | |
| "epoch": 371.8, | |
| "grad_norm": 2.2371928274108455, | |
| "kl": 0.517822265625, | |
| "learning_rate": 2.9704273724003526e-08, | |
| "loss": 0.0676, | |
| "reward": 11.292187675833702, | |
| "reward_std": 6.194611236453056, | |
| "rewards/accuracy_reward_staging": 1.0042187473736703, | |
| "rewards/format_reward": 0.65625, | |
| "rewards/format_reward_staging": 0.59375, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 559.84375, | |
| "epoch": 372.8, | |
| "grad_norm": 2.531647136891463, | |
| "kl": 0.52001953125, | |
| "learning_rate": 2.7630079602323443e-08, | |
| "loss": 0.1587, | |
| "reward": 10.668749883770943, | |
| "reward_std": 8.836217179894447, | |
| "rewards/accuracy_reward_staging": 0.9575000163167715, | |
| "rewards/format_reward": 0.546875, | |
| "rewards/format_reward_staging": 0.546875, | |
| "step": 373 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 503.234375, | |
| "epoch": 373.8, | |
| "grad_norm": 1.8905690009692866, | |
| "kl": 0.56494140625, | |
| "learning_rate": 2.5629935214764864e-08, | |
| "loss": 0.0793, | |
| "reward": 10.550000071525574, | |
| "reward_std": 7.391293793916702, | |
| "rewards/accuracy_reward_staging": 0.931562501937151, | |
| "rewards/format_reward": 0.640625, | |
| "rewards/format_reward_staging": 0.59375, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 518.109375, | |
| "epoch": 374.8, | |
| "grad_norm": 4.31839552805722, | |
| "kl": 0.7208251953125, | |
| "learning_rate": 2.3703992880066636e-08, | |
| "loss": 0.0576, | |
| "reward": 11.893750101327896, | |
| "reward_std": 7.522340267896652, | |
| "rewards/accuracy_reward_staging": 1.0612500254064798, | |
| "rewards/format_reward": 0.65625, | |
| "rewards/format_reward_staging": 0.625, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 560.953125, | |
| "epoch": 375.8, | |
| "grad_norm": 2.2893855348554824, | |
| "kl": 0.4915771484375, | |
| "learning_rate": 2.185239926619431e-08, | |
| "loss": 0.0645, | |
| "reward": 11.574999958276749, | |
| "reward_std": 8.68027800321579, | |
| "rewards/accuracy_reward_staging": 1.0246875025331974, | |
| "rewards/format_reward": 0.671875, | |
| "rewards/format_reward_staging": 0.65625, | |
| "step": 376 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 503.046875, | |
| "epoch": 376.8, | |
| "grad_norm": 2.0521716339261573, | |
| "kl": 0.5130615234375, | |
| "learning_rate": 2.007529537917041e-08, | |
| "loss": 0.0747, | |
| "reward": 11.942187368869781, | |
| "reward_std": 6.6959647461771965, | |
| "rewards/accuracy_reward_staging": 1.0660937773063779, | |
| "rewards/format_reward": 0.640625, | |
| "rewards/format_reward_staging": 0.640625, | |
| "step": 377 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 544.703125, | |
| "epoch": 377.8, | |
| "grad_norm": 2.6744855588179597, | |
| "kl": 0.6314697265625, | |
| "learning_rate": 1.8372816552336023e-08, | |
| "loss": 0.0892, | |
| "reward": 10.899999856948853, | |
| "reward_std": 7.818521216511726, | |
| "rewards/accuracy_reward_staging": 0.957187520340085, | |
| "rewards/format_reward": 0.6875, | |
| "rewards/format_reward_staging": 0.640625, | |
| "step": 378 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 536.0, | |
| "epoch": 378.8, | |
| "grad_norm": 2.1993782801564827, | |
| "kl": 0.5281982421875, | |
| "learning_rate": 1.6745092436045492e-08, | |
| "loss": 0.09, | |
| "reward": 10.765625104308128, | |
| "reward_std": 9.52804271876812, | |
| "rewards/accuracy_reward_staging": 0.9468750059604645, | |
| "rewards/format_reward": 0.671875, | |
| "rewards/format_reward_staging": 0.625, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 379.8, | |
| "grad_norm": 2.850728275624988, | |
| "learning_rate": 1.519224698779198e-08, | |
| "loss": 0.0848, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 379.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 504.675, | |
| "eval_kl": 0.4578125, | |
| "eval_loss": 0.07416948676109314, | |
| "eval_reward": 11.396249985694885, | |
| "eval_reward_std": 8.601353228092194, | |
| "eval_rewards/accuracy_reward_staging": 1.0146250143647193, | |
| "eval_rewards/format_reward": 0.6375, | |
| "eval_rewards/format_reward_staging": 0.6125, | |
| "eval_runtime": 121.4493, | |
| "eval_samples_per_second": 0.165, | |
| "eval_steps_per_second": 0.041, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 510.578125, | |
| "epoch": 380.8, | |
| "grad_norm": 2.9736298141622295, | |
| "kl": 0.55657958984375, | |
| "learning_rate": 1.3714398462768562e-08, | |
| "loss": 0.0869, | |
| "reward": 12.212500043213367, | |
| "reward_std": 7.73147202283144, | |
| "rewards/accuracy_reward_staging": 1.0837499964982271, | |
| "rewards/format_reward": 0.6953125, | |
| "rewards/format_reward_staging": 0.6796875, | |
| "step": 381 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 522.1875, | |
| "epoch": 381.8, | |
| "grad_norm": 2.0373989754015125, | |
| "kl": 0.5072021484375, | |
| "learning_rate": 1.231165940486234e-08, | |
| "loss": 0.034, | |
| "reward": 12.49843743443489, | |
| "reward_std": 6.0733470767736435, | |
| "rewards/accuracy_reward_staging": 1.1014062575995922, | |
| "rewards/format_reward": 0.734375, | |
| "rewards/format_reward_staging": 0.75, | |
| "step": 382 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 498.09375, | |
| "epoch": 382.8, | |
| "grad_norm": 2.0529799885966997, | |
| "kl": 0.5218505859375, | |
| "learning_rate": 1.0984136638083175e-08, | |
| "loss": 0.0674, | |
| "reward": 12.098437368869781, | |
| "reward_std": 7.62692953646183, | |
| "rewards/accuracy_reward_staging": 1.0817187586799264, | |
| "rewards/format_reward": 0.640625, | |
| "rewards/format_reward_staging": 0.640625, | |
| "step": 383 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 523.203125, | |
| "epoch": 383.8, | |
| "grad_norm": 2.0301601145066495, | |
| "kl": 0.521240234375, | |
| "learning_rate": 9.731931258429638e-09, | |
| "loss": 0.059, | |
| "reward": 10.729687452316284, | |
| "reward_std": 5.984116218984127, | |
| "rewards/accuracy_reward_staging": 0.9464062480255961, | |
| "rewards/format_reward": 0.671875, | |
| "rewards/format_reward_staging": 0.59375, | |
| "step": 384 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 502.109375, | |
| "epoch": 384.8, | |
| "grad_norm": 4.517612366443085, | |
| "kl": 0.602294921875, | |
| "learning_rate": 8.555138626189618e-09, | |
| "loss": 0.0919, | |
| "reward": 10.614062532782555, | |
| "reward_std": 7.197298094630241, | |
| "rewards/accuracy_reward_staging": 0.9317187555134296, | |
| "rewards/format_reward": 0.65625, | |
| "rewards/format_reward_staging": 0.640625, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 531.5625, | |
| "epoch": 385.8, | |
| "grad_norm": 1.996721078816804, | |
| "kl": 0.542236328125, | |
| "learning_rate": 7.453848358678017e-09, | |
| "loss": 0.0893, | |
| "reward": 12.423437505960464, | |
| "reward_std": 7.174239456653595, | |
| "rewards/accuracy_reward_staging": 1.0970312654972076, | |
| "rewards/format_reward": 0.765625, | |
| "rewards/format_reward_staging": 0.6875, | |
| "step": 386 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 529.5, | |
| "epoch": 386.8, | |
| "grad_norm": 1.6692450152429934, | |
| "kl": 0.4757080078125, | |
| "learning_rate": 6.4281443234125434e-09, | |
| "loss": 0.0622, | |
| "reward": 10.457812458276749, | |
| "reward_std": 8.408779114484787, | |
| "rewards/accuracy_reward_staging": 0.9207812584936619, | |
| "rewards/format_reward": 0.6875, | |
| "rewards/format_reward_staging": 0.5625, | |
| "step": 387 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 516.28125, | |
| "epoch": 387.8, | |
| "grad_norm": 72.65003366147363, | |
| "kl": 1.12158203125, | |
| "learning_rate": 5.47810463172671e-09, | |
| "loss": 0.0677, | |
| "reward": 14.623437404632568, | |
| "reward_std": 7.604200206696987, | |
| "rewards/accuracy_reward_staging": 1.3154687583446503, | |
| "rewards/format_reward": 0.75, | |
| "rewards/format_reward_staging": 0.71875, | |
| "step": 388 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 540.546875, | |
| "epoch": 388.8, | |
| "grad_norm": 2.13782688418701, | |
| "kl": 0.5604248046875, | |
| "learning_rate": 4.603801632821147e-09, | |
| "loss": 0.0741, | |
| "reward": 11.571875154972076, | |
| "reward_std": 8.498483955860138, | |
| "rewards/accuracy_reward_staging": 1.0556250140070915, | |
| "rewards/format_reward": 0.515625, | |
| "rewards/format_reward_staging": 0.5, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 389.8, | |
| "grad_norm": 2.550655354845835, | |
| "learning_rate": 3.805301908254455e-09, | |
| "loss": 0.0785, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 389.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 543.7, | |
| "eval_kl": 0.77880859375, | |
| "eval_loss": 0.05070864409208298, | |
| "eval_reward": 11.214999914169312, | |
| "eval_reward_std": 7.508172661066055, | |
| "eval_rewards/accuracy_reward_staging": 1.0102500110864638, | |
| "eval_rewards/format_reward": 0.575, | |
| "eval_rewards/format_reward_staging": 0.5375, | |
| "eval_runtime": 147.1522, | |
| "eval_samples_per_second": 0.136, | |
| "eval_steps_per_second": 0.034, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 514.140625, | |
| "epoch": 390.8, | |
| "grad_norm": 2.7881900199384537, | |
| "kl": 0.58929443359375, | |
| "learning_rate": 3.082666266872036e-09, | |
| "loss": 0.0336, | |
| "reward": 12.8234374076128, | |
| "reward_std": 7.5515576638281345, | |
| "rewards/accuracy_reward_staging": 1.1495312573388219, | |
| "rewards/format_reward": 0.6640625, | |
| "rewards/format_reward_staging": 0.6640625, | |
| "step": 391 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 524.25, | |
| "epoch": 391.8, | |
| "grad_norm": 2.166883481802826, | |
| "kl": 0.585693359375, | |
| "learning_rate": 2.435949740175802e-09, | |
| "loss": 0.0693, | |
| "reward": 9.915624856948853, | |
| "reward_std": 7.9057832062244415, | |
| "rewards/accuracy_reward_staging": 0.8790625054389238, | |
| "rewards/format_reward": 0.578125, | |
| "rewards/format_reward_staging": 0.546875, | |
| "step": 392 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 558.03125, | |
| "epoch": 392.8, | |
| "grad_norm": 2.013874424977512, | |
| "kl": 0.5469970703125, | |
| "learning_rate": 1.86520157813308e-09, | |
| "loss": -0.0047, | |
| "reward": 10.384374856948853, | |
| "reward_std": 7.076003402471542, | |
| "rewards/accuracy_reward_staging": 0.9103124821558595, | |
| "rewards/format_reward": 0.65625, | |
| "rewards/format_reward_staging": 0.625, | |
| "step": 393 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 573.859375, | |
| "epoch": 393.8, | |
| "grad_norm": 4.782888144712166, | |
| "kl": 0.7337646484375, | |
| "learning_rate": 1.3704652454261667e-09, | |
| "loss": 0.0785, | |
| "reward": 10.860937401652336, | |
| "reward_std": 8.005559802055359, | |
| "rewards/accuracy_reward_staging": 0.9782812423072755, | |
| "rewards/format_reward": 0.546875, | |
| "rewards/format_reward_staging": 0.53125, | |
| "step": 394 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 509.453125, | |
| "epoch": 394.8, | |
| "grad_norm": 5.775714842908385, | |
| "kl": 0.88623046875, | |
| "learning_rate": 9.517784181422018e-10, | |
| "loss": 0.0558, | |
| "reward": 10.265624895691872, | |
| "reward_std": 7.404541149735451, | |
| "rewards/accuracy_reward_staging": 0.9109375043772161, | |
| "rewards/format_reward": 0.609375, | |
| "rewards/format_reward_staging": 0.546875, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 543.453125, | |
| "epoch": 395.8, | |
| "grad_norm": 2.0434917013326266, | |
| "kl": 0.5186767578125, | |
| "learning_rate": 6.091729809042379e-10, | |
| "loss": 0.0542, | |
| "reward": 11.81874991953373, | |
| "reward_std": 9.100275874137878, | |
| "rewards/accuracy_reward_staging": 1.0662500150501728, | |
| "rewards/format_reward": 0.609375, | |
| "rewards/format_reward_staging": 0.546875, | |
| "step": 396 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 546.1875, | |
| "epoch": 396.8, | |
| "grad_norm": 1.7418802984126296, | |
| "kl": 0.4542236328125, | |
| "learning_rate": 3.426750244427401e-10, | |
| "loss": 0.07, | |
| "reward": 12.928124815225601, | |
| "reward_std": 7.5549889877438545, | |
| "rewards/accuracy_reward_staging": 1.1490625096485019, | |
| "rewards/format_reward": 0.71875, | |
| "rewards/format_reward_staging": 0.71875, | |
| "step": 397 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 492.109375, | |
| "epoch": 397.8, | |
| "grad_norm": 3.724982054081198, | |
| "kl": 0.7119140625, | |
| "learning_rate": 1.5230484360873042e-10, | |
| "loss": 0.0322, | |
| "reward": 12.485937371850014, | |
| "reward_std": 7.725762560963631, | |
| "rewards/accuracy_reward_staging": 1.1079687606543303, | |
| "rewards/format_reward": 0.734375, | |
| "rewards/format_reward_staging": 0.671875, | |
| "step": 398 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 550.203125, | |
| "epoch": 398.8, | |
| "grad_norm": 7.1968629953818315, | |
| "kl": 0.607421875, | |
| "learning_rate": 3.8076935828690315e-11, | |
| "loss": 0.1134, | |
| "reward": 9.825000047683716, | |
| "reward_std": 8.716731041669846, | |
| "rewards/accuracy_reward_staging": 0.8637500181794167, | |
| "rewards/format_reward": 0.640625, | |
| "rewards/format_reward_staging": 0.546875, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 399.8, | |
| "grad_norm": 2.578825219229897, | |
| "learning_rate": 0.0, | |
| "loss": 0.0625, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 399.8, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 564.25, | |
| "eval_kl": 0.541796875, | |
| "eval_loss": 0.1237805113196373, | |
| "eval_reward": 12.454999792575837, | |
| "eval_reward_std": 7.5952778339385985, | |
| "eval_rewards/accuracy_reward_staging": 1.1054999977350235, | |
| "eval_rewards/format_reward": 0.7125, | |
| "eval_rewards/format_reward_staging": 0.6875, | |
| "eval_runtime": 144.5952, | |
| "eval_samples_per_second": 0.138, | |
| "eval_steps_per_second": 0.035, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 498.03125, | |
| "epoch": 399.8, | |
| "kl": 0.4832763671875, | |
| "reward": 14.412499785423279, | |
| "reward_std": 7.106586746871471, | |
| "rewards/accuracy_reward_staging": 1.294375006109476, | |
| "rewards/format_reward": 0.734375, | |
| "rewards/format_reward_staging": 0.734375, | |
| "step": 400, | |
| "total_flos": 0.0, | |
| "train_loss": 1.0999555667603271, | |
| "train_runtime": 38247.6092, | |
| "train_samples_per_second": 0.209, | |
| "train_steps_per_second": 0.01 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 400, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 400, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |