diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6114 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 99.88888888888889, + "eval_steps": 50, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 535.125, + "epoch": 0.2222222222222222, + "grad_norm": 1.7916724271880604, + "kl": 0.0, + "learning_rate": 5e-08, + "loss": 0.0583, + "reward": 2.3125, + "reward_std": 1.1971687823534012, + "rewards/accuracy_reward_staging": 0.09375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.90625, + "epoch": 0.4444444444444444, + "grad_norm": 1.5555075403521712, + "kl": 0.0, + "learning_rate": 1e-07, + "loss": -0.0705, + "reward": 2.5625, + "reward_std": 1.2858919501304626, + "rewards/accuracy_reward_staging": 0.15625, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.9375, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.46875, + "epoch": 0.6666666666666666, + "grad_norm": 1.6594522931688669, + "kl": 0.0010576248168945312, + "learning_rate": 1.5e-07, + "loss": -0.0235, + "reward": 2.59375, + "reward_std": 1.6232599020004272, + "rewards/accuracy_reward_staging": 0.15625, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.90625, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.25, + "epoch": 0.8888888888888888, + "grad_norm": 2.3276142189283164, + "kl": 0.0011081695556640625, + "learning_rate": 2e-07, + "loss": 0.1029, + "reward": 2.875, + "reward_std": 1.8071783781051636, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.96875, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 649.84375, + "epoch": 1.2222222222222223, + "grad_norm": 1.5167959821278052, + "kl": 0.0010709762573242188, + "learning_rate": 2.5e-07, + "loss": 0.0003, + "reward": 2.84375, + "reward_std": 1.7606024742126465, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.96875, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.25, + "epoch": 1.4444444444444444, + "grad_norm": 1.491122536644779, + "kl": 0.0009145736694335938, + "learning_rate": 3e-07, + "loss": 0.0377, + "reward": 2.75, + "reward_std": 1.8017165958881378, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.90625, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.46875, + "epoch": 1.6666666666666665, + "grad_norm": 1.5321454699600687, + "kl": 0.0016422271728515625, + "learning_rate": 3.5e-07, + "loss": 0.0173, + "reward": 2.8125, + "reward_std": 1.498587191104889, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.0625, + "epoch": 1.8888888888888888, + "grad_norm": 1.7429693147530465, + "kl": 0.0010614395141601562, + "learning_rate": 4e-07, + "loss": 0.0413, + "reward": 3.15625, + "reward_std": 2.1272581219673157, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.96875, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 643.875, + "epoch": 2.2222222222222223, + "grad_norm": 1.53726074310182, + "kl": 0.0013751983642578125, + "learning_rate": 4.5e-07, + "loss": -0.005, + "reward": 3.125, + "reward_std": 2.054091453552246, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 614.09375, + "epoch": 2.4444444444444446, + "grad_norm": 1.3654100960829842, + "kl": 0.0012149810791015625, + "learning_rate": 5e-07, + "loss": -0.0164, + "reward": 2.59375, + "reward_std": 1.0483438968658447, + "rewards/accuracy_reward_staging": 0.125, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.3125, + "epoch": 2.6666666666666665, + "grad_norm": 1.4260001116361793, + "kl": 0.0010051727294921875, + "learning_rate": 5.5e-07, + "loss": 0.0251, + "reward": 3.0625, + "reward_std": 1.7733518332242966, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.96875, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.5625, + "epoch": 2.888888888888889, + "grad_norm": 1.5253120629648043, + "kl": 0.001361846923828125, + "learning_rate": 6e-07, + "loss": 0.0285, + "reward": 3.3125, + "reward_std": 1.9136751294136047, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 1.0, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.71875, + "epoch": 3.2222222222222223, + "grad_norm": 1.5612924435198745, + "kl": 0.0019207000732421875, + "learning_rate": 6.5e-07, + "loss": 0.0829, + "reward": 2.0625, + "reward_std": 0.5475594997406006, + "rewards/accuracy_reward_staging": 0.0625, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.9375, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 648.125, + "epoch": 3.4444444444444446, + "grad_norm": 1.472369166378751, + "kl": 0.0019435882568359375, + "learning_rate": 7e-07, + "loss": 0.0889, + "reward": 2.3125, + "reward_std": 1.3669461011886597, + "rewards/accuracy_reward_staging": 0.125, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.84375, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 660.78125, + "epoch": 3.6666666666666665, + "grad_norm": 1.2833764786982476, + "kl": 0.00171661376953125, + "learning_rate": 7.5e-07, + "loss": -0.0032, + "reward": 2.28125, + "reward_std": 0.9946783781051636, + "rewards/accuracy_reward_staging": 0.09375, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.96875, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.625, + "epoch": 3.888888888888889, + "grad_norm": 1.7981216304584955, + "kl": 0.003185272216796875, + "learning_rate": 8e-07, + "loss": 0.0022, + "reward": 4.09375, + "reward_std": 2.7086294293403625, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.375, + "epoch": 4.222222222222222, + "grad_norm": 1.8924801483136653, + "kl": 0.003849029541015625, + "learning_rate": 8.499999999999999e-07, + "loss": 0.0192, + "reward": 2.8125, + "reward_std": 1.4357599020004272, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 626.875, + "epoch": 4.444444444444445, + "grad_norm": 1.4237753323985947, + "kl": 0.004940032958984375, + "learning_rate": 9e-07, + "loss": 0.0048, + "reward": 2.78125, + "reward_std": 1.6875, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.875, + "epoch": 4.666666666666667, + "grad_norm": 1.4401282377616447, + "kl": 0.00505828857421875, + "learning_rate": 9.499999999999999e-07, + "loss": 0.0016, + "reward": 3.4375, + "reward_std": 2.3147872537374496, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.90625, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.8125, + "epoch": 4.888888888888889, + "grad_norm": 1.1629869227175655, + "kl": 0.00585174560546875, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 2.5625, + "reward_std": 0.9797460436820984, + "rewards/accuracy_reward_staging": 0.125, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 1.0, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 614.875, + "epoch": 5.222222222222222, + "grad_norm": 1.6115188653051613, + "kl": 0.00612640380859375, + "learning_rate": 9.999829128320873e-07, + "loss": 0.0565, + "reward": 3.28125, + "reward_std": 2.4976893961429596, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.46875, + "epoch": 5.444444444444445, + "grad_norm": 1.465512353981508, + "kl": 0.00824737548828125, + "learning_rate": 9.999316524962345e-07, + "loss": 0.0541, + "reward": 3.3125, + "reward_std": 1.8101893961429596, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.59375, + "epoch": 5.666666666666667, + "grad_norm": 1.5847579776558225, + "kl": 0.0093841552734375, + "learning_rate": 9.998462224960173e-07, + "loss": 0.06, + "reward": 3.6875, + "reward_std": 2.443375587463379, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 645.9375, + "epoch": 5.888888888888889, + "grad_norm": 1.8362203993654154, + "kl": 0.00734710693359375, + "learning_rate": 9.99726628670463e-07, + "loss": 0.0368, + "reward": 3.03125, + "reward_std": 2.283504918217659, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.71875, + "rewards/format_reward_staging": 0.90625, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.5, + "epoch": 6.222222222222222, + "grad_norm": 1.6415108932304052, + "kl": 0.0096588134765625, + "learning_rate": 9.995728791936505e-07, + "loss": 0.0267, + "reward": 2.96875, + "reward_std": 1.7760016024112701, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.96875, + "epoch": 6.444444444444445, + "grad_norm": 1.4689069714869325, + "kl": 0.010345458984375, + "learning_rate": 9.993849845741523e-07, + "loss": 0.1034, + "reward": 2.5625, + "reward_std": 1.1108438968658447, + "rewards/accuracy_reward_staging": 0.15625, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.90625, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.46875, + "epoch": 6.666666666666667, + "grad_norm": 1.7253968854719324, + "kl": 0.01122283935546875, + "learning_rate": 9.991629576543163e-07, + "loss": -0.0129, + "reward": 2.625, + "reward_std": 1.316565990447998, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.90625, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.0, + "epoch": 6.888888888888889, + "grad_norm": 1.439672104037944, + "kl": 0.0132293701171875, + "learning_rate": 9.989068136093872e-07, + "loss": 0.0324, + "reward": 3.375, + "reward_std": 2.423195868730545, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 557.5625, + "epoch": 7.222222222222222, + "grad_norm": 1.53093980357088, + "kl": 0.0146942138671875, + "learning_rate": 9.986165699464705e-07, + "loss": -0.0074, + "reward": 3.125, + "reward_std": 2.0308370888233185, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.90625, + "epoch": 7.444444444444445, + "grad_norm": 1.0715134693817079, + "kl": 0.0147857666015625, + "learning_rate": 9.982922465033348e-07, + "loss": -0.0166, + "reward": 2.5, + "reward_std": 0.9858438968658447, + "rewards/accuracy_reward_staging": 0.125, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 1.0, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.65625, + "epoch": 7.666666666666667, + "grad_norm": 1.4389686833903352, + "kl": 0.01611328125, + "learning_rate": 9.979338654470567e-07, + "loss": 0.0875, + "reward": 2.4375, + "reward_std": 1.2930222898721695, + "rewards/accuracy_reward_staging": 0.125, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completion_length": 669.5625, + "epoch": 7.888888888888889, + "grad_norm": 1.0489321524468773, + "kl": 0.01910400390625, + "learning_rate": 9.975414512725056e-07, + "loss": 0.0185, + "reward": 2.5625, + "reward_std": 1.037847101688385, + "rewards/accuracy_reward_staging": 0.125, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.625, + "epoch": 8.222222222222221, + "grad_norm": 1.5157291140048736, + "kl": 0.01885986328125, + "learning_rate": 9.971150308006687e-07, + "loss": -0.0001, + "reward": 4.125, + "reward_std": 2.000675529241562, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 1.0, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.03125, + "epoch": 8.444444444444445, + "grad_norm": 1.5963578785319679, + "kl": 0.0192413330078125, + "learning_rate": 9.966546331768192e-07, + "loss": 0.1269, + "reward": 2.875, + "reward_std": 2.112294152379036, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.84375, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 603.09375, + "epoch": 8.666666666666666, + "grad_norm": 1.4508455813252856, + "kl": 0.01494598388671875, + "learning_rate": 9.961602898685223e-07, + "loss": 0.0585, + "reward": 3.3125, + "reward_std": 2.0126227736473083, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.9375, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.9375, + "epoch": 8.88888888888889, + "grad_norm": 1.196537394176258, + "kl": 0.0169830322265625, + "learning_rate": 9.956320346634875e-07, + "loss": 0.0166, + "reward": 2.78125, + "reward_std": 1.3710740953683853, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 1.0, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.96875, + "epoch": 9.222222222222221, + "grad_norm": 1.4031846103728705, + "kl": 0.0164794921875, + "learning_rate": 9.95069903667256e-07, + "loss": 0.0257, + "reward": 2.65625, + "reward_std": 1.4369846880435944, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.90625, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.8125, + "epoch": 9.444444444444445, + "grad_norm": 1.7378697171564481, + "kl": 0.019744873046875, + "learning_rate": 9.944739353007341e-07, + "loss": 0.0651, + "reward": 3.6875, + "reward_std": 2.841255784034729, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.96875, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.5625, + "epoch": 9.666666666666666, + "grad_norm": 1.6742496883549038, + "kl": 0.018218994140625, + "learning_rate": 9.938441702975689e-07, + "loss": 0.0249, + "reward": 2.4375, + "reward_std": 1.2126952707767487, + "rewards/accuracy_reward_staging": 0.125, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.96875, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.375, + "epoch": 9.88888888888889, + "grad_norm": 1.6853780037379804, + "kl": 0.0196533203125, + "learning_rate": 9.931806517013612e-07, + "loss": 0.0121, + "reward": 2.5625, + "reward_std": 1.3815238624811172, + "rewards/accuracy_reward_staging": 0.15625, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.9375, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.90625, + "epoch": 10.222222222222221, + "grad_norm": 1.2047759950332129, + "kl": 0.017730712890625, + "learning_rate": 9.924834248627258e-07, + "loss": 0.0398, + "reward": 2.8125, + "reward_std": 1.4487498700618744, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.34375, + "epoch": 10.444444444444445, + "grad_norm": 2.2662890327219642, + "kl": 0.032135009765625, + "learning_rate": 9.917525374361911e-07, + "loss": 0.0402, + "reward": 3.375, + "reward_std": 2.6460810601711273, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.96875, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.4375, + "epoch": 10.666666666666666, + "grad_norm": 0.8485843884389722, + "kl": 0.021148681640625, + "learning_rate": 9.909880393769418e-07, + "loss": 0.0349, + "reward": 2.5, + "reward_std": 1.045437604188919, + "rewards/accuracy_reward_staging": 0.125, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.9375, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 626.625, + "epoch": 10.88888888888889, + "grad_norm": 1.4242611362455049, + "kl": 0.018280029296875, + "learning_rate": 9.901899829374047e-07, + "loss": 0.0405, + "reward": 3.03125, + "reward_std": 2.107846677303314, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.90625, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.8125, + "epoch": 11.222222222222221, + "grad_norm": 1.6000184113652984, + "kl": 0.025238037109375, + "learning_rate": 9.893584226636772e-07, + "loss": -0.0471, + "reward": 2.78125, + "reward_std": 1.6772827804088593, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 1.0, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 588.65625, + "epoch": 11.444444444444445, + "grad_norm": 1.2633801476740014, + "kl": 0.02093505859375, + "learning_rate": 9.884934153917996e-07, + "loss": 0.027, + "reward": 2.4375, + "reward_std": 1.226884126663208, + "rewards/accuracy_reward_staging": 0.125, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.1875, + "epoch": 11.666666666666666, + "grad_norm": 1.7228504370636915, + "kl": 0.020599365234375, + "learning_rate": 9.8759502024387e-07, + "loss": -0.0016, + "reward": 3.125, + "reward_std": 1.8041669130325317, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.90625, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.875, + "epoch": 11.88888888888889, + "grad_norm": 7.859881611636793, + "kl": 0.063079833984375, + "learning_rate": 9.866632986240029e-07, + "loss": 0.0482, + "reward": 3.25, + "reward_std": 2.0755133628845215, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.5625, + "epoch": 12.222222222222221, + "grad_norm": 1.7851397304147796, + "kl": 0.0205078125, + "learning_rate": 9.856983142141337e-07, + "loss": 0.0509, + "reward": 3.3125, + "reward_std": 2.14286145567894, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.9375, + "step": 49 + }, + { + "epoch": 12.444444444444445, + "grad_norm": 1.585137209096838, + "learning_rate": 9.847001329696652e-07, + "loss": -0.0125, + "step": 50 + }, + { + "epoch": 12.444444444444445, + "eval_clip_ratio": 0.0, + "eval_completion_length": 597.925, + "eval_kl": 0.02578125, + "eval_loss": 0.024221811443567276, + "eval_reward": 2.625, + "eval_reward_std": 1.6041045665740967, + "eval_rewards/accuracy_reward_staging": 0.175, + "eval_rewards/format_reward": 0.8, + "eval_rewards/format_reward_staging": 0.95, + "eval_runtime": 51.776, + "eval_samples_per_second": 0.695, + "eval_steps_per_second": 0.097, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.046875, + "epoch": 12.666666666666666, + "grad_norm": 1.6497404550782266, + "kl": 0.020294189453125, + "learning_rate": 9.836688231149592e-07, + "loss": -0.0235, + "reward": 3.328125, + "reward_std": 2.148952841758728, + "rewards/accuracy_reward_staging": 0.296875, + "rewards/format_reward": 0.890625, + "rewards/format_reward_staging": 0.953125, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 655.3125, + "epoch": 12.88888888888889, + "grad_norm": 1.0110588489868237, + "kl": 0.018829345703125, + "learning_rate": 9.826044551386742e-07, + "loss": -0.0207, + "reward": 2.5625, + "reward_std": 1.046603798866272, + "rewards/accuracy_reward_staging": 0.125, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.65625, + "epoch": 13.222222222222221, + "grad_norm": 1.5942717910970237, + "kl": 0.0233154296875, + "learning_rate": 9.81507101788948e-07, + "loss": 0.0327, + "reward": 2.96875, + "reward_std": 2.0815286338329315, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 1.0, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.0, + "epoch": 13.444444444444445, + "grad_norm": 1.6431487531106521, + "kl": 0.02325439453125, + "learning_rate": 9.803768380684242e-07, + "loss": -0.005, + "reward": 3.1875, + "reward_std": 2.4305797815322876, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.90625, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.71875, + "epoch": 13.666666666666666, + "grad_norm": 1.3532727186337274, + "kl": 0.021026611328125, + "learning_rate": 9.792137412291263e-07, + "loss": -0.0091, + "reward": 3.09375, + "reward_std": 1.5625, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.375, + "epoch": 13.88888888888889, + "grad_norm": 1.4441812945667367, + "kl": 0.024932861328125, + "learning_rate": 9.780178907671788e-07, + "loss": 0.0275, + "reward": 3.34375, + "reward_std": 2.1209341287612915, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.9375, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.15625, + "epoch": 14.222222222222221, + "grad_norm": 1.6824005469371979, + "kl": 0.026092529296875, + "learning_rate": 9.76789368417372e-07, + "loss": -0.0531, + "reward": 2.8125, + "reward_std": 1.377088338136673, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 1.0, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.96875, + "epoch": 14.444444444444445, + "grad_norm": 1.4915574785365073, + "kl": 0.021026611328125, + "learning_rate": 9.755282581475767e-07, + "loss": 0.0364, + "reward": 4.9375, + "reward_std": 2.745547831058502, + "rewards/accuracy_reward_staging": 0.59375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.71875, + "epoch": 14.666666666666666, + "grad_norm": 1.4821961551515155, + "kl": 0.02593994140625, + "learning_rate": 9.742346461530047e-07, + "loss": 0.0872, + "reward": 2.53125, + "reward_std": 1.4375, + "rewards/accuracy_reward_staging": 0.125, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.375, + "epoch": 14.88888888888889, + "grad_norm": 1.249530356824017, + "kl": 0.023406982421875, + "learning_rate": 9.729086208503173e-07, + "loss": 0.0652, + "reward": 2.4375, + "reward_std": 1.1680222749710083, + "rewards/accuracy_reward_staging": 0.125, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.71875, + "epoch": 15.222222222222221, + "grad_norm": 1.4621397072761817, + "kl": 0.0252685546875, + "learning_rate": 9.715502728715825e-07, + "loss": 0.0108, + "reward": 2.96875, + "reward_std": 1.8319481909275055, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 621.71875, + "epoch": 15.444444444444445, + "grad_norm": 1.4960047973343167, + "kl": 0.023590087890625, + "learning_rate": 9.701596950580807e-07, + "loss": -0.008, + "reward": 3.21875, + "reward_std": 2.3255662322044373, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.0, + "epoch": 15.666666666666666, + "grad_norm": 1.377229747116843, + "kl": 0.031097412109375, + "learning_rate": 9.687369824539576e-07, + "loss": 0.072, + "reward": 2.9375, + "reward_std": 1.7239685356616974, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 542.125, + "epoch": 15.88888888888889, + "grad_norm": 1.3837348765591453, + "kl": 0.034423828125, + "learning_rate": 9.672822322997304e-07, + "loss": 0.0508, + "reward": 2.28125, + "reward_std": 1.1752630770206451, + "rewards/accuracy_reward_staging": 0.09375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.90625, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 645.625, + "epoch": 16.22222222222222, + "grad_norm": 1.2294440183285422, + "kl": 0.023651123046875, + "learning_rate": 9.657955440256395e-07, + "loss": -0.0012, + "reward": 2.59375, + "reward_std": 1.0483438968658447, + "rewards/accuracy_reward_staging": 0.125, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.46875, + "epoch": 16.444444444444443, + "grad_norm": 1.5888259552277046, + "kl": 0.02777099609375, + "learning_rate": 9.642770192448535e-07, + "loss": 0.0496, + "reward": 3.71875, + "reward_std": 2.2672154307365417, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.71875, + "rewards/format_reward_staging": 0.96875, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.875, + "epoch": 16.666666666666668, + "grad_norm": 1.5785381612535059, + "kl": 0.034942626953125, + "learning_rate": 9.627267617465243e-07, + "loss": -0.0426, + "reward": 3.03125, + "reward_std": 1.496883064508438, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 1.0, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.4375, + "epoch": 16.88888888888889, + "grad_norm": 1.5972037559178247, + "kl": 0.026702880859375, + "learning_rate": 9.611448774886923e-07, + "loss": 0.005, + "reward": 3.15625, + "reward_std": 1.9091877937316895, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 624.21875, + "epoch": 17.22222222222222, + "grad_norm": 2.781118760006495, + "kl": 0.042724609375, + "learning_rate": 9.595314745910455e-07, + "loss": 0.0926, + "reward": 3.3125, + "reward_std": 2.4584514498710632, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.9375, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.65625, + "epoch": 17.444444444444443, + "grad_norm": 1.6678524207695304, + "kl": 0.028411865234375, + "learning_rate": 9.578866633275286e-07, + "loss": 0.0606, + "reward": 3.75, + "reward_std": 2.237764596939087, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.9375, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.0, + "epoch": 17.666666666666668, + "grad_norm": 1.4163449995088322, + "kl": 0.032470703125, + "learning_rate": 9.562105561188068e-07, + "loss": 0.0105, + "reward": 3.40625, + "reward_std": 1.9233438968658447, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completion_length": 619.8125, + "epoch": 17.88888888888889, + "grad_norm": 1.330864720677356, + "kl": 0.02783203125, + "learning_rate": 9.545032675245813e-07, + "loss": 0.0232, + "reward": 2.875, + "reward_std": 1.5208123177289963, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.90625, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 615.5625, + "epoch": 18.22222222222222, + "grad_norm": 1.3940735985441313, + "kl": 0.0289306640625, + "learning_rate": 9.527649142357594e-07, + "loss": 0.0449, + "reward": 4.8125, + "reward_std": 3.365248918533325, + "rewards/accuracy_reward_staging": 0.59375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.9375, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.6875, + "epoch": 18.444444444444443, + "grad_norm": 1.573293312721447, + "kl": 0.031890869140625, + "learning_rate": 9.509956150664795e-07, + "loss": 0.0727, + "reward": 2.40625, + "reward_std": 1.0983919501304626, + "rewards/accuracy_reward_staging": 0.125, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 1.0, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.71875, + "epoch": 18.666666666666668, + "grad_norm": 1.3288952801951834, + "kl": 0.028411865234375, + "learning_rate": 9.491954909459894e-07, + "loss": 0.0299, + "reward": 4.125, + "reward_std": 2.0565126538276672, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.875, + "epoch": 18.88888888888889, + "grad_norm": 1.600349042443185, + "kl": 0.03497314453125, + "learning_rate": 9.473646649103817e-07, + "loss": 0.0048, + "reward": 3.46875, + "reward_std": 2.3291621804237366, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.15625, + "epoch": 19.22222222222222, + "grad_norm": 2.0117307258354242, + "kl": 0.034820556640625, + "learning_rate": 9.455032620941839e-07, + "loss": 0.0076, + "reward": 3.15625, + "reward_std": 1.690910965204239, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.125, + "epoch": 19.444444444444443, + "grad_norm": 1.2154400614249532, + "kl": 0.034393310546875, + "learning_rate": 9.436114097218058e-07, + "loss": 0.0153, + "reward": 2.34375, + "reward_std": 0.9375, + "rewards/accuracy_reward_staging": 0.09375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.9375, + "epoch": 19.666666666666668, + "grad_norm": 1.6406138056170174, + "kl": 0.029205322265625, + "learning_rate": 9.416892370988442e-07, + "loss": 0.0752, + "reward": 2.75, + "reward_std": 1.9128470420837402, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.6875, + "epoch": 19.88888888888889, + "grad_norm": 1.527942838909739, + "kl": 0.030364990234375, + "learning_rate": 9.397368756032444e-07, + "loss": -0.0126, + "reward": 4.34375, + "reward_std": 3.079783648252487, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.53125, + "epoch": 20.22222222222222, + "grad_norm": 1.6342025185675375, + "kl": 0.032318115234375, + "learning_rate": 9.377544586763214e-07, + "loss": -0.0331, + "reward": 4.0625, + "reward_std": 2.1620407104492188, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.125, + "epoch": 20.444444444444443, + "grad_norm": 0.9760974984694354, + "kl": 0.03082275390625, + "learning_rate": 9.357421218136386e-07, + "loss": -0.0281, + "reward": 2.90625, + "reward_std": 1.3726893961429596, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 607.1875, + "epoch": 20.666666666666668, + "grad_norm": 2.9650567483991894, + "kl": 0.0552978515625, + "learning_rate": 9.337000025557476e-07, + "loss": 0.0494, + "reward": 2.78125, + "reward_std": 1.907078742980957, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.875, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completion_length": 648.5625, + "epoch": 20.88888888888889, + "grad_norm": 1.4868637308996282, + "kl": 0.04937744140625, + "learning_rate": 9.316282404787869e-07, + "loss": 0.0813, + "reward": 2.4375, + "reward_std": 1.534547746181488, + "rewards/accuracy_reward_staging": 0.15625, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.875, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.0, + "epoch": 21.22222222222222, + "grad_norm": 1.5354307812124717, + "kl": 0.03326416015625, + "learning_rate": 9.295269771849425e-07, + "loss": 0.1102, + "reward": 3.53125, + "reward_std": 2.2993226647377014, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.25, + "epoch": 21.444444444444443, + "grad_norm": 1.21306534102283, + "kl": 0.03662109375, + "learning_rate": 9.273963562927694e-07, + "loss": 0.0034, + "reward": 2.90625, + "reward_std": 1.0483438968658447, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.71875, + "epoch": 21.666666666666668, + "grad_norm": 11.152096799903676, + "kl": 0.09722900390625, + "learning_rate": 9.252365234273753e-07, + "loss": 0.0125, + "reward": 3.1875, + "reward_std": 1.9283326417207718, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.96875, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.0625, + "epoch": 21.88888888888889, + "grad_norm": 1.4820021533223564, + "kl": 0.04046630859375, + "learning_rate": 9.230476262104676e-07, + "loss": 0.0631, + "reward": 3.40625, + "reward_std": 2.233847141265869, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 1.0, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.625, + "epoch": 22.22222222222222, + "grad_norm": 1.6685149630374954, + "kl": 0.04840087890625, + "learning_rate": 9.208298142502635e-07, + "loss": 0.057, + "reward": 2.90625, + "reward_std": 1.7658206820487976, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.96875, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 652.0, + "epoch": 22.444444444444443, + "grad_norm": 1.2598669750057847, + "kl": 0.037872314453125, + "learning_rate": 9.185832391312642e-07, + "loss": 0.0397, + "reward": 2.1875, + "reward_std": 1.0936830341815948, + "rewards/accuracy_reward_staging": 0.09375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.84375, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 512.84375, + "epoch": 22.666666666666668, + "grad_norm": 1.464116156191612, + "kl": 0.0408935546875, + "learning_rate": 9.163080544038952e-07, + "loss": 0.0325, + "reward": 3.1875, + "reward_std": 2.0054054260253906, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.9375, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.0, + "epoch": 22.88888888888889, + "grad_norm": 1.7502771652549964, + "kl": 0.0543212890625, + "learning_rate": 9.1400441557401e-07, + "loss": 0.1198, + "reward": 4.375, + "reward_std": 2.6551371216773987, + "rewards/accuracy_reward_staging": 0.53125, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.90625, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.375, + "epoch": 23.22222222222222, + "grad_norm": 1.5494132503619473, + "kl": 0.04376220703125, + "learning_rate": 9.116724800922629e-07, + "loss": 0.1098, + "reward": 3.6875, + "reward_std": 1.9493454694747925, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.5, + "epoch": 23.444444444444443, + "grad_norm": 1.2511045169588764, + "kl": 0.0521240234375, + "learning_rate": 9.093124073433462e-07, + "loss": 0.0389, + "reward": 3.5625, + "reward_std": 2.1182020902633667, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 1.0, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 590.78125, + "epoch": 23.666666666666668, + "grad_norm": 1.5974928179741261, + "kl": 0.045074462890625, + "learning_rate": 9.069243586350975e-07, + "loss": -0.0127, + "reward": 4.09375, + "reward_std": 2.1429253816604614, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 1.0, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.1875, + "epoch": 23.88888888888889, + "grad_norm": 1.885519118261372, + "kl": 0.0450439453125, + "learning_rate": 9.045084971874737e-07, + "loss": 0.0469, + "reward": 4.0625, + "reward_std": 2.76924729347229, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.0625, + "epoch": 24.22222222222222, + "grad_norm": 1.5682355592026038, + "kl": 0.05267333984375, + "learning_rate": 9.020649881213958e-07, + "loss": 0.0061, + "reward": 3.40625, + "reward_std": 2.1967990398406982, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.9375, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 632.75, + "epoch": 24.444444444444443, + "grad_norm": 1.2736403946455588, + "kl": 0.044189453125, + "learning_rate": 8.995939984474623e-07, + "loss": 0.0172, + "reward": 3.84375, + "reward_std": 2.4564297795295715, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 1.0, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.90625, + "epoch": 24.666666666666668, + "grad_norm": 1.5123549273068009, + "kl": 0.04638671875, + "learning_rate": 8.970956970545355e-07, + "loss": 0.0662, + "reward": 3.78125, + "reward_std": 2.7111909985542297, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 99 + }, + { + "epoch": 24.88888888888889, + "grad_norm": 1.7849471498217702, + "learning_rate": 8.945702546981968e-07, + "loss": 0.142, + "step": 100 + }, + { + "epoch": 24.88888888888889, + "eval_clip_ratio": 0.0, + "eval_completion_length": 511.125, + "eval_kl": 0.073193359375, + "eval_loss": -0.007530718110501766, + "eval_reward": 2.075, + "eval_reward_std": 0.6665439963340759, + "eval_rewards/accuracy_reward_staging": 0.05, + "eval_rewards/format_reward": 0.85, + "eval_rewards/format_reward_staging": 0.975, + "eval_runtime": 50.3514, + "eval_samples_per_second": 0.715, + "eval_steps_per_second": 0.099, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.21875, + "epoch": 25.22222222222222, + "grad_norm": 1.8652012487452174, + "kl": 0.05792236328125, + "learning_rate": 8.920178439890764e-07, + "loss": 0.0112, + "reward": 3.46875, + "reward_std": 1.8295301795005798, + "rewards/accuracy_reward_staging": 0.328125, + "rewards/format_reward": 0.890625, + "rewards/format_reward_staging": 0.9375, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.875, + "epoch": 25.444444444444443, + "grad_norm": 4.84795988570716, + "kl": 0.06231689453125, + "learning_rate": 8.894386393810562e-07, + "loss": 0.0844, + "reward": 2.875, + "reward_std": 1.6470783054828644, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.96875, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.5, + "epoch": 25.666666666666668, + "grad_norm": 1.8754521848828094, + "kl": 0.052978515625, + "learning_rate": 8.868328171593446e-07, + "loss": -0.0154, + "reward": 4.25, + "reward_std": 2.547704756259918, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 1.0, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.3125, + "epoch": 25.88888888888889, + "grad_norm": 1.797756724546587, + "kl": 0.05206298828125, + "learning_rate": 8.842005554284295e-07, + "loss": -0.0275, + "reward": 3.84375, + "reward_std": 2.51630362868309, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.90625, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 622.53125, + "epoch": 26.22222222222222, + "grad_norm": 1.300955660750681, + "kl": 0.05413818359375, + "learning_rate": 8.815420340999033e-07, + "loss": 0.0637, + "reward": 3.84375, + "reward_std": 1.3620327413082123, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.25, + "epoch": 26.444444444444443, + "grad_norm": 1.43085940167237, + "kl": 0.0439453125, + "learning_rate": 8.788574348801674e-07, + "loss": 0.0768, + "reward": 4.625, + "reward_std": 1.9858438968658447, + "rewards/accuracy_reward_staging": 0.53125, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.15625, + "epoch": 26.666666666666668, + "grad_norm": 1.6135777138066925, + "kl": 0.06390380859375, + "learning_rate": 8.761469412580124e-07, + "loss": 0.0142, + "reward": 1.96875, + "reward_std": 1.00966876745224, + "rewards/accuracy_reward_staging": 0.0625, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.875, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.0625, + "epoch": 26.88888888888889, + "grad_norm": 2.1207393888337887, + "kl": 0.06134033203125, + "learning_rate": 8.734107384920769e-07, + "loss": 0.0242, + "reward": 4.125, + "reward_std": 3.0213340520858765, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 1.0, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.40625, + "epoch": 27.22222222222222, + "grad_norm": 1.5591626897717148, + "kl": 0.0465087890625, + "learning_rate": 8.706490135981855e-07, + "loss": -0.0282, + "reward": 4.5625, + "reward_std": 2.360237419605255, + "rewards/accuracy_reward_staging": 0.53125, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.8125, + "epoch": 27.444444444444443, + "grad_norm": 1.1645802297935302, + "kl": 0.04632568359375, + "learning_rate": 8.678619553365658e-07, + "loss": -0.0278, + "reward": 3.21875, + "reward_std": 1.8432062864303589, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 610.21875, + "epoch": 27.666666666666668, + "grad_norm": 1.7617563181087859, + "kl": 0.0550537109375, + "learning_rate": 8.650497541989481e-07, + "loss": -0.0219, + "reward": 2.84375, + "reward_std": 1.7233919501304626, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.5, + "epoch": 27.88888888888889, + "grad_norm": 1.4790592908519822, + "kl": 0.04412841796875, + "learning_rate": 8.622126023955445e-07, + "loss": 0.0624, + "reward": 3.65625, + "reward_std": 2.0483438968658447, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.21875, + "epoch": 28.22222222222222, + "grad_norm": 1.4599487812585739, + "kl": 0.0484619140625, + "learning_rate": 8.593506938419119e-07, + "loss": 0.0459, + "reward": 3.84375, + "reward_std": 0.9925079494714737, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.96875, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.71875, + "epoch": 28.444444444444443, + "grad_norm": 1.3832144010184737, + "kl": 0.0467529296875, + "learning_rate": 8.564642241456986e-07, + "loss": 0.0025, + "reward": 3.71875, + "reward_std": 1.9233438968658447, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.78125, + "epoch": 28.666666666666668, + "grad_norm": 1.803131512178923, + "kl": 0.0576171875, + "learning_rate": 8.535533905932737e-07, + "loss": -0.0187, + "reward": 3.90625, + "reward_std": 2.563826858997345, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.75, + "rewards/format_reward_staging": 0.96875, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.4375, + "epoch": 28.88888888888889, + "grad_norm": 1.6402368950584498, + "kl": 0.05029296875, + "learning_rate": 8.506183921362442e-07, + "loss": -0.0174, + "reward": 3.3125, + "reward_std": 2.5466037690639496, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.5625, + "epoch": 29.22222222222222, + "grad_norm": 1.4985773194128882, + "kl": 0.04840087890625, + "learning_rate": 8.47659429377856e-07, + "loss": -0.0153, + "reward": 3.875, + "reward_std": 2.3320942521095276, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.34375, + "epoch": 29.444444444444443, + "grad_norm": 1.7501844147476033, + "kl": 0.05194091796875, + "learning_rate": 8.446767045592829e-07, + "loss": 0.0359, + "reward": 3.84375, + "reward_std": 2.3963494896888733, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.96875, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.5, + "epoch": 29.666666666666668, + "grad_norm": 1.2571401212163673, + "kl": 0.0498046875, + "learning_rate": 8.416704215458042e-07, + "loss": 0.0187, + "reward": 3.3125, + "reward_std": 1.125, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.9375, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.25, + "epoch": 29.88888888888889, + "grad_norm": 1.2235795288016953, + "kl": 0.04754638671875, + "learning_rate": 8.386407858128706e-07, + "loss": -0.0144, + "reward": 3.25, + "reward_std": 1.5358919501304626, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.15625, + "epoch": 30.22222222222222, + "grad_norm": 1.6274382257749778, + "kl": 0.060791015625, + "learning_rate": 8.355880044320597e-07, + "loss": 0.0121, + "reward": 3.34375, + "reward_std": 2.7569093704223633, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.84375, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.8125, + "epoch": 30.444444444444443, + "grad_norm": 2.5186927220968895, + "kl": 0.09588623046875, + "learning_rate": 8.325122860569241e-07, + "loss": 0.0081, + "reward": 3.15625, + "reward_std": 2.1270195841789246, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.875, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.1875, + "epoch": 30.666666666666668, + "grad_norm": 1.4932442148368137, + "kl": 0.04656982421875, + "learning_rate": 8.294138409087289e-07, + "loss": 0.0298, + "reward": 3.625, + "reward_std": 2.008278489112854, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 1.0, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 607.375, + "epoch": 30.88888888888889, + "grad_norm": 3.4718877576698746, + "kl": 0.076904296875, + "learning_rate": 8.262928807620843e-07, + "loss": -0.0234, + "reward": 3.6875, + "reward_std": 2.751339912414551, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.90625, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.5625, + "epoch": 31.22222222222222, + "grad_norm": 1.622119125741056, + "kl": 0.05914306640625, + "learning_rate": 8.231496189304704e-07, + "loss": 0.0119, + "reward": 3.78125, + "reward_std": 1.9775724411010742, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 1.0, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 645.84375, + "epoch": 31.444444444444443, + "grad_norm": 1.6061164218143151, + "kl": 0.0496826171875, + "learning_rate": 8.199842702516582e-07, + "loss": 0.0355, + "reward": 3.90625, + "reward_std": 2.5803541243076324, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.9375, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.3125, + "epoch": 31.666666666666668, + "grad_norm": 1.3457598005679037, + "kl": 0.0526123046875, + "learning_rate": 8.167970510730252e-07, + "loss": -0.0134, + "reward": 3.15625, + "reward_std": 1.8007422089576721, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 640.25, + "epoch": 31.88888888888889, + "grad_norm": 1.5569181185599603, + "kl": 0.058349609375, + "learning_rate": 8.135881792367685e-07, + "loss": -0.0192, + "reward": 3.59375, + "reward_std": 1.5271694660186768, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 1.0, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.34375, + "epoch": 32.22222222222222, + "grad_norm": 1.6790409041925978, + "kl": 0.05426025390625, + "learning_rate": 8.103578740650156e-07, + "loss": -0.0013, + "reward": 3.8125, + "reward_std": 2.151860535144806, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 1.0, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.0, + "epoch": 32.44444444444444, + "grad_norm": 1.7164186713447234, + "kl": 0.0628662109375, + "learning_rate": 8.071063563448339e-07, + "loss": 0.0355, + "reward": 3.09375, + "reward_std": 2.110320746898651, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.90625, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 612.125, + "epoch": 32.666666666666664, + "grad_norm": 1.4484213626473657, + "kl": 0.0438232421875, + "learning_rate": 8.038338483131406e-07, + "loss": 0.0675, + "reward": 2.65625, + "reward_std": 1.5483438968658447, + "rewards/accuracy_reward_staging": 0.15625, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 633.375, + "epoch": 32.888888888888886, + "grad_norm": 1.4888928164051263, + "kl": 0.046630859375, + "learning_rate": 8.005405736415125e-07, + "loss": 0.003, + "reward": 3.5625, + "reward_std": 2.257579743862152, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.28125, + "epoch": 33.22222222222222, + "grad_norm": 1.4537634594396451, + "kl": 0.05352783203125, + "learning_rate": 7.97226757420899e-07, + "loss": 0.0072, + "reward": 4.53125, + "reward_std": 2.650395154953003, + "rewards/accuracy_reward_staging": 0.53125, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.625, + "epoch": 33.44444444444444, + "grad_norm": 5.103167634384414, + "kl": 0.107421875, + "learning_rate": 7.938926261462365e-07, + "loss": 0.0303, + "reward": 3.96875, + "reward_std": 1.4233438968658447, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 1.0, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.90625, + "epoch": 33.666666666666664, + "grad_norm": 5.13739196509469, + "kl": 0.09185791015625, + "learning_rate": 7.905384077009692e-07, + "loss": 0.0254, + "reward": 3.40625, + "reward_std": 2.5271694660186768, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.0, + "epoch": 33.888888888888886, + "grad_norm": 1.3347218031999781, + "kl": 0.05279541015625, + "learning_rate": 7.871643313414718e-07, + "loss": -0.0269, + "reward": 3.78125, + "reward_std": 1.9108592867851257, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.96875, + "epoch": 34.22222222222222, + "grad_norm": 1.6203773256898213, + "kl": 0.05377197265625, + "learning_rate": 7.837706276813818e-07, + "loss": -0.0507, + "reward": 3.78125, + "reward_std": 2.8475868701934814, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.90625, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.90625, + "epoch": 34.44444444444444, + "grad_norm": 1.7589228637659193, + "kl": 0.0518798828125, + "learning_rate": 7.803575286758363e-07, + "loss": 0.0256, + "reward": 3.84375, + "reward_std": 2.3770764470100403, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.90625, + "epoch": 34.666666666666664, + "grad_norm": 1.465848261824115, + "kl": 0.05047607421875, + "learning_rate": 7.769252676056186e-07, + "loss": 0.0121, + "reward": 3.0, + "reward_std": 1.999484658241272, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 621.5625, + "epoch": 34.888888888888886, + "grad_norm": 1.699502675045011, + "kl": 0.04669189453125, + "learning_rate": 7.734740790612136e-07, + "loss": -0.0043, + "reward": 3.65625, + "reward_std": 2.740947127342224, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.84375, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.03125, + "epoch": 35.22222222222222, + "grad_norm": 1.4180294898308454, + "kl": 0.04791259765625, + "learning_rate": 7.700041989267736e-07, + "loss": 0.0128, + "reward": 3.9375, + "reward_std": 1.6851893663406372, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 1.0, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 634.3125, + "epoch": 35.44444444444444, + "grad_norm": 0.97669552258444, + "kl": 0.04840087890625, + "learning_rate": 7.665158643639969e-07, + "loss": 0.0078, + "reward": 3.90625, + "reward_std": 1.2753951847553253, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 1.0, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.9375, + "epoch": 35.666666666666664, + "grad_norm": 1.4705421421024347, + "kl": 0.0458984375, + "learning_rate": 7.63009313795917e-07, + "loss": 0.0007, + "reward": 3.375, + "reward_std": 1.9858438968658447, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.8125, + "epoch": 35.888888888888886, + "grad_norm": 1.4040857696410018, + "kl": 0.0491943359375, + "learning_rate": 7.594847868906076e-07, + "loss": 0.0157, + "reward": 4.53125, + "reward_std": 1.881795346736908, + "rewards/accuracy_reward_staging": 0.5625, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.9375, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.4375, + "epoch": 36.22222222222222, + "grad_norm": 1.7416315495447303, + "kl": 0.05291748046875, + "learning_rate": 7.559425245448005e-07, + "loss": 0.1534, + "reward": 4.125, + "reward_std": 1.7268692255020142, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.90625, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.34375, + "epoch": 36.44444444444444, + "grad_norm": 1.3338618690781434, + "kl": 0.05255126953125, + "learning_rate": 7.523827688674219e-07, + "loss": 0.0048, + "reward": 3.46875, + "reward_std": 1.7618454992771149, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.625, + "epoch": 36.666666666666664, + "grad_norm": 1.8245501253344487, + "kl": 0.04931640625, + "learning_rate": 7.488057631630437e-07, + "loss": 0.0975, + "reward": 3.78125, + "reward_std": 2.0842358469963074, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 1.0, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.25, + "epoch": 36.888888888888886, + "grad_norm": 1.60039839729214, + "kl": 0.0479736328125, + "learning_rate": 7.452117519152541e-07, + "loss": -0.0225, + "reward": 4.75, + "reward_std": 2.8358521461486816, + "rewards/accuracy_reward_staging": 0.59375, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.9375, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.59375, + "epoch": 37.22222222222222, + "grad_norm": 1.9262708460562594, + "kl": 0.04852294921875, + "learning_rate": 7.416009807699481e-07, + "loss": 0.0694, + "reward": 3.875, + "reward_std": 2.4488722383975983, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.9375, + "step": 149 + }, + { + "epoch": 37.44444444444444, + "grad_norm": 1.4126152849193538, + "learning_rate": 7.379736965185368e-07, + "loss": 0.0461, + "step": 150 + }, + { + "epoch": 37.44444444444444, + "eval_clip_ratio": 0.0, + "eval_completion_length": 583.05, + "eval_kl": 0.045751953125, + "eval_loss": -0.002990193199366331, + "eval_reward": 2.725, + "eval_reward_std": 1.3047046661376953, + "eval_rewards/accuracy_reward_staging": 0.175, + "eval_rewards/format_reward": 0.875, + "eval_rewards/format_reward_staging": 0.975, + "eval_runtime": 52.1348, + "eval_samples_per_second": 0.691, + "eval_steps_per_second": 0.096, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.234375, + "epoch": 37.666666666666664, + "grad_norm": 1.6128745248404066, + "kl": 0.0498046875, + "learning_rate": 7.343301470810807e-07, + "loss": 0.0205, + "reward": 3.8125, + "reward_std": 2.2092738151550293, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.953125, + "rewards/format_reward_staging": 0.984375, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.6875, + "epoch": 37.888888888888886, + "grad_norm": 1.6630192300364095, + "kl": 0.051513671875, + "learning_rate": 7.306705814893439e-07, + "loss": 0.0613, + "reward": 4.75, + "reward_std": 3.510585069656372, + "rewards/accuracy_reward_staging": 0.5625, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.40625, + "epoch": 38.22222222222222, + "grad_norm": 1.5821951322432892, + "kl": 0.0535888671875, + "learning_rate": 7.269952498697734e-07, + "loss": 0.0053, + "reward": 3.78125, + "reward_std": 2.4141127467155457, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.75, + "rewards/format_reward_staging": 0.84375, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.6875, + "epoch": 38.44444444444444, + "grad_norm": 2.2424195208633453, + "kl": 0.07366943359375, + "learning_rate": 7.233044034264033e-07, + "loss": 0.0315, + "reward": 3.84375, + "reward_std": 2.3134855031967163, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.75, + "rewards/format_reward_staging": 0.90625, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.9375, + "epoch": 38.666666666666664, + "grad_norm": 1.529067187866317, + "kl": 0.05157470703125, + "learning_rate": 7.195982944236852e-07, + "loss": 0.0321, + "reward": 2.8125, + "reward_std": 1.796603798866272, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.9375, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.0, + "epoch": 38.888888888888886, + "grad_norm": 1.579548286063579, + "kl": 0.050537109375, + "learning_rate": 7.158771761692464e-07, + "loss": 0.0309, + "reward": 4.28125, + "reward_std": 2.8335397839546204, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.90625, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.0625, + "epoch": 39.22222222222222, + "grad_norm": 1.4963033786464435, + "kl": 0.050048828125, + "learning_rate": 7.121413029965769e-07, + "loss": 0.0482, + "reward": 3.8125, + "reward_std": 2.3843142986297607, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.9375, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.59375, + "epoch": 39.44444444444444, + "grad_norm": 1.4775879396602463, + "kl": 0.054443359375, + "learning_rate": 7.083909302476452e-07, + "loss": 0.0164, + "reward": 3.71875, + "reward_std": 1.9704924821853638, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.90625, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.8125, + "epoch": 39.666666666666664, + "grad_norm": 1.7649876199956425, + "kl": 0.0699462890625, + "learning_rate": 7.04626314255447e-07, + "loss": 0.0019, + "reward": 4.4375, + "reward_std": 2.7981574535369873, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 1.0, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 498.75, + "epoch": 39.888888888888886, + "grad_norm": 1.3915513369029784, + "kl": 0.0543212890625, + "learning_rate": 7.008477123264847e-07, + "loss": 0.0433, + "reward": 2.90625, + "reward_std": 1.3342358469963074, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.53125, + "epoch": 40.22222222222222, + "grad_norm": 1.562840542671513, + "kl": 0.053955078125, + "learning_rate": 6.970553827231808e-07, + "loss": 0.0164, + "reward": 4.625, + "reward_std": 2.55762779712677, + "rewards/accuracy_reward_staging": 0.53125, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.3125, + "epoch": 40.44444444444444, + "grad_norm": 1.4692239574350316, + "kl": 0.0526123046875, + "learning_rate": 6.932495846462261e-07, + "loss": -0.0164, + "reward": 3.65625, + "reward_std": 1.8189646005630493, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.875, + "epoch": 40.666666666666664, + "grad_norm": 1.5332016106483515, + "kl": 0.05316162109375, + "learning_rate": 6.894305782168638e-07, + "loss": -0.0429, + "reward": 4.3125, + "reward_std": 2.5211293697357178, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.125, + "epoch": 40.888888888888886, + "grad_norm": 13.136996472534078, + "kl": 0.11846923828125, + "learning_rate": 6.855986244591103e-07, + "loss": -0.0235, + "reward": 3.28125, + "reward_std": 2.338345527648926, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.90625, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.40625, + "epoch": 41.22222222222222, + "grad_norm": 1.3242743159265937, + "kl": 0.04998779296875, + "learning_rate": 6.817539852819148e-07, + "loss": 0.0115, + "reward": 3.1875, + "reward_std": 1.375, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.1875, + "epoch": 41.44444444444444, + "grad_norm": 1.2491437366023406, + "kl": 0.05328369140625, + "learning_rate": 6.778969234612583e-07, + "loss": 0.0198, + "reward": 4.84375, + "reward_std": 1.7444601655006409, + "rewards/accuracy_reward_staging": 0.59375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 619.59375, + "epoch": 41.666666666666664, + "grad_norm": 1.6792411977674075, + "kl": 0.05487060546875, + "learning_rate": 6.740277026221922e-07, + "loss": 0.011, + "reward": 3.21875, + "reward_std": 2.509488582611084, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.96875, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 539.125, + "epoch": 41.888888888888886, + "grad_norm": 2.61987913810964, + "kl": 0.08526611328125, + "learning_rate": 6.701465872208216e-07, + "loss": 0.0355, + "reward": 5.71875, + "reward_std": 2.992280900478363, + "rewards/accuracy_reward_staging": 0.78125, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 1.0, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.65625, + "epoch": 42.22222222222222, + "grad_norm": 1.5317010015458066, + "kl": 0.0543212890625, + "learning_rate": 6.662538425262284e-07, + "loss": -0.0412, + "reward": 3.75, + "reward_std": 2.802945911884308, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 607.5, + "epoch": 42.44444444444444, + "grad_norm": 1.5445749846218586, + "kl": 0.05462646484375, + "learning_rate": 6.623497346023417e-07, + "loss": -0.0053, + "reward": 3.0625, + "reward_std": 1.4321783781051636, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.90625, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.53125, + "epoch": 42.666666666666664, + "grad_norm": 1.727090236953967, + "kl": 0.05303955078125, + "learning_rate": 6.584345302897522e-07, + "loss": 0.0752, + "reward": 4.9375, + "reward_std": 2.6843830347061157, + "rewards/accuracy_reward_staging": 0.59375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.125, + "epoch": 42.888888888888886, + "grad_norm": 1.463526052255072, + "kl": 0.05108642578125, + "learning_rate": 6.545084971874736e-07, + "loss": -0.0218, + "reward": 4.28125, + "reward_std": 2.3289482593536377, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 1.0, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.46875, + "epoch": 43.22222222222222, + "grad_norm": 1.6155726361043279, + "kl": 0.06103515625, + "learning_rate": 6.505719036346537e-07, + "loss": 0.0385, + "reward": 3.3125, + "reward_std": 2.2124131619930267, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.90625, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.96875, + "epoch": 43.44444444444444, + "grad_norm": 1.3751712901264466, + "kl": 0.0545654296875, + "learning_rate": 6.466250186922324e-07, + "loss": 0.0063, + "reward": 3.1875, + "reward_std": 2.130874752998352, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.84375, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.71875, + "epoch": 43.666666666666664, + "grad_norm": 1.4756595692040109, + "kl": 0.059326171875, + "learning_rate": 6.426681121245527e-07, + "loss": -0.0295, + "reward": 3.59375, + "reward_std": 2.3869778215885162, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.9375, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.1875, + "epoch": 43.888888888888886, + "grad_norm": 1.4156928353056575, + "kl": 0.050048828125, + "learning_rate": 6.387014543809223e-07, + "loss": -0.0245, + "reward": 3.625, + "reward_std": 2.184383064508438, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.9375, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.28125, + "epoch": 44.22222222222222, + "grad_norm": 1.6625979237822903, + "kl": 0.05389404296875, + "learning_rate": 6.347253165771289e-07, + "loss": 0.0393, + "reward": 4.34375, + "reward_std": 2.0728103518486023, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 616.4375, + "epoch": 44.44444444444444, + "grad_norm": 0.9016620089051227, + "kl": 0.04852294921875, + "learning_rate": 6.307399704769098e-07, + "loss": 0.0327, + "reward": 3.3125, + "reward_std": 1.9239110946655273, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.40625, + "epoch": 44.666666666666664, + "grad_norm": 1.4445648538792832, + "kl": 0.06365966796875, + "learning_rate": 6.26745688473377e-07, + "loss": 0.0527, + "reward": 2.90625, + "reward_std": 1.2700245678424835, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.6875, + "epoch": 44.888888888888886, + "grad_norm": 1.5856705116806837, + "kl": 0.06280517578125, + "learning_rate": 6.227427435703995e-07, + "loss": 0.0488, + "reward": 3.59375, + "reward_std": 2.1598991453647614, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 1.0, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 540.375, + "epoch": 45.22222222222222, + "grad_norm": 1.4832995345417785, + "kl": 0.0472412109375, + "learning_rate": 6.187314093639443e-07, + "loss": 0.021, + "reward": 3.8125, + "reward_std": 2.2678900957107544, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.75, + "epoch": 45.44444444444444, + "grad_norm": 1.5871845074006228, + "kl": 0.048828125, + "learning_rate": 6.147119600233758e-07, + "loss": -0.025, + "reward": 4.40625, + "reward_std": 2.732926845550537, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.9375, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 636.21875, + "epoch": 45.666666666666664, + "grad_norm": 1.1682267885626447, + "kl": 0.050537109375, + "learning_rate": 6.106846702727172e-07, + "loss": -0.0041, + "reward": 3.5625, + "reward_std": 1.9367179870605469, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 567.6875, + "epoch": 45.888888888888886, + "grad_norm": 1.182505436622169, + "kl": 0.052490234375, + "learning_rate": 6.066498153718734e-07, + "loss": -0.0104, + "reward": 3.96875, + "reward_std": 1.8926886320114136, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.84375, + "epoch": 46.22222222222222, + "grad_norm": 74.95843070592915, + "kl": 0.51153564453125, + "learning_rate": 6.026076710978171e-07, + "loss": -0.0099, + "reward": 4.03125, + "reward_std": 2.5020731687545776, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.9375, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 635.9375, + "epoch": 46.44444444444444, + "grad_norm": 1.1802575443084546, + "kl": 0.046630859375, + "learning_rate": 5.985585137257401e-07, + "loss": -0.0104, + "reward": 3.75, + "reward_std": 1.5358919501304626, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.9375, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.625, + "epoch": 46.666666666666664, + "grad_norm": 1.540364554923698, + "kl": 0.053955078125, + "learning_rate": 5.945026200101702e-07, + "loss": 0.0173, + "reward": 3.71875, + "reward_std": 2.7078438997268677, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.9375, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.375, + "epoch": 46.888888888888886, + "grad_norm": 1.3487938182691792, + "kl": 0.05859375, + "learning_rate": 5.90440267166055e-07, + "loss": 0.0363, + "reward": 3.125, + "reward_std": 2.2170365154743195, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.75, + "rewards/format_reward_staging": 0.96875, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 536.40625, + "epoch": 47.22222222222222, + "grad_norm": 1.7030200868844614, + "kl": 0.054931640625, + "learning_rate": 5.863717328498152e-07, + "loss": 0.0328, + "reward": 3.84375, + "reward_std": 2.070079743862152, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.96875, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.875, + "epoch": 47.44444444444444, + "grad_norm": 1.7566836455673576, + "kl": 0.05218505859375, + "learning_rate": 5.82297295140367e-07, + "loss": -0.0381, + "reward": 3.75, + "reward_std": 2.009314328432083, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.8125, + "epoch": 47.666666666666664, + "grad_norm": 1.594063347049537, + "kl": 0.05426025390625, + "learning_rate": 5.782172325201155e-07, + "loss": 0.0535, + "reward": 3.21875, + "reward_std": 1.7700316905975342, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 622.8125, + "epoch": 47.888888888888886, + "grad_norm": 1.5439318867500331, + "kl": 0.04937744140625, + "learning_rate": 5.741318238559209e-07, + "loss": -0.0012, + "reward": 4.75, + "reward_std": 2.4349581599235535, + "rewards/accuracy_reward_staging": 0.5625, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 1.0, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.75, + "epoch": 48.22222222222222, + "grad_norm": 2.5201319810344454, + "kl": 0.0770263671875, + "learning_rate": 5.700413483800389e-07, + "loss": -0.0762, + "reward": 3.4375, + "reward_std": 1.82216876745224, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.90625, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.59375, + "epoch": 48.44444444444444, + "grad_norm": 1.473198815087056, + "kl": 0.05352783203125, + "learning_rate": 5.659460856710345e-07, + "loss": -0.0055, + "reward": 3.5625, + "reward_std": 1.9599019289016724, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.9375, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.75, + "epoch": 48.666666666666664, + "grad_norm": 1.6168573027198114, + "kl": 0.05010986328125, + "learning_rate": 5.618463156346739e-07, + "loss": -0.0075, + "reward": 4.21875, + "reward_std": 1.739636391401291, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.9375, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.375, + "epoch": 48.888888888888886, + "grad_norm": 1.5839729942600627, + "kl": 0.04180908203125, + "learning_rate": 5.577423184847931e-07, + "loss": 0.0086, + "reward": 3.875, + "reward_std": 2.332531690597534, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.96875, + "epoch": 49.22222222222222, + "grad_norm": 1.5767088515541903, + "kl": 0.04962158203125, + "learning_rate": 5.536343747241459e-07, + "loss": 0.0159, + "reward": 4.15625, + "reward_std": 1.9809716939926147, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.28125, + "epoch": 49.44444444444444, + "grad_norm": 1.3049917889915577, + "kl": 0.04583740234375, + "learning_rate": 5.495227651252315e-07, + "loss": 0.0386, + "reward": 4.53125, + "reward_std": 1.7373294830322266, + "rewards/accuracy_reward_staging": 0.53125, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.0625, + "epoch": 49.666666666666664, + "grad_norm": 1.3164741992532543, + "kl": 0.0504150390625, + "learning_rate": 5.454077707111041e-07, + "loss": 0.0142, + "reward": 4.65625, + "reward_std": 1.945079743862152, + "rewards/accuracy_reward_staging": 0.53125, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 199 + }, + { + "epoch": 49.888888888888886, + "grad_norm": 1.3350172420738584, + "learning_rate": 5.412896727361662e-07, + "loss": 0.0656, + "step": 200 + }, + { + "epoch": 49.888888888888886, + "eval_clip_ratio": 0.0, + "eval_completion_length": 600.85, + "eval_kl": 0.047802734375, + "eval_loss": 0.025471828877925873, + "eval_reward": 2.6, + "eval_reward_std": 1.3353363513946532, + "eval_rewards/accuracy_reward_staging": 0.15, + "eval_rewards/format_reward": 0.9, + "eval_rewards/format_reward_staging": 0.95, + "eval_runtime": 52.2669, + "eval_samples_per_second": 0.689, + "eval_steps_per_second": 0.096, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 614.453125, + "epoch": 50.22222222222222, + "grad_norm": 1.284721283794701, + "kl": 0.05389404296875, + "learning_rate": 5.371687526669439e-07, + "loss": 0.0086, + "reward": 3.421875, + "reward_std": 2.202674761414528, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.890625, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 545.53125, + "epoch": 50.44444444444444, + "grad_norm": 1.235503506247465, + "kl": 0.0528564453125, + "learning_rate": 5.330452921628497e-07, + "loss": -0.0137, + "reward": 3.5625, + "reward_std": 1.246154248714447, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 560.25, + "epoch": 50.666666666666664, + "grad_norm": 1.9492031211380043, + "kl": 0.0654296875, + "learning_rate": 5.28919573056932e-07, + "loss": -0.049, + "reward": 4.28125, + "reward_std": 2.934589922428131, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.90625, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 599.3125, + "epoch": 50.888888888888886, + "grad_norm": 1.567290502007258, + "kl": 0.04364013671875, + "learning_rate": 5.247918773366111e-07, + "loss": 0.0937, + "reward": 3.875, + "reward_std": 1.930722177028656, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.9375, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.6875, + "epoch": 51.22222222222222, + "grad_norm": 1.4477817793212922, + "kl": 0.05084228515625, + "learning_rate": 5.206624871244065e-07, + "loss": 0.0148, + "reward": 2.90625, + "reward_std": 1.4091877937316895, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.625, + "epoch": 51.44444444444444, + "grad_norm": 1.5674813338685252, + "kl": 0.04931640625, + "learning_rate": 5.165316846586541e-07, + "loss": 0.0963, + "reward": 3.125, + "reward_std": 2.1649354100227356, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.9375, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.5625, + "epoch": 51.666666666666664, + "grad_norm": 1.521375079838418, + "kl": 0.046875, + "learning_rate": 5.123997522742151e-07, + "loss": 0.0215, + "reward": 3.71875, + "reward_std": 2.047757565975189, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.9375, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.4375, + "epoch": 51.888888888888886, + "grad_norm": 1.637742061840183, + "kl": 0.04779052734375, + "learning_rate": 5.082669723831793e-07, + "loss": -0.0249, + "reward": 3.59375, + "reward_std": 2.858625650405884, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.875, + "epoch": 52.22222222222222, + "grad_norm": 1.5832843072882397, + "kl": 0.04449462890625, + "learning_rate": 5.041336274555625e-07, + "loss": -0.063, + "reward": 2.84375, + "reward_std": 1.2771694660186768, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.0625, + "epoch": 52.44444444444444, + "grad_norm": 1.5508316387978383, + "kl": 0.06103515625, + "learning_rate": 5e-07, + "loss": -0.0291, + "reward": 4.0, + "reward_std": 2.082531690597534, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 611.71875, + "epoch": 52.666666666666664, + "grad_norm": 1.6164552079690877, + "kl": 0.04437255859375, + "learning_rate": 4.958663725444375e-07, + "loss": 0.0102, + "reward": 4.40625, + "reward_std": 2.5580477714538574, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.3125, + "epoch": 52.888888888888886, + "grad_norm": 1.5726439650006456, + "kl": 0.05096435546875, + "learning_rate": 4.917330276168208e-07, + "loss": -0.0031, + "reward": 4.96875, + "reward_std": 2.3175911903381348, + "rewards/accuracy_reward_staging": 0.625, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 1.0, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 616.375, + "epoch": 53.22222222222222, + "grad_norm": 1.7880025106936461, + "kl": 0.04498291015625, + "learning_rate": 4.87600247725785e-07, + "loss": 0.066, + "reward": 3.1875, + "reward_std": 1.891027882695198, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.90625, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completion_length": 556.6875, + "epoch": 53.44444444444444, + "grad_norm": 2.0232137942713573, + "kl": 0.0498046875, + "learning_rate": 4.834683153413459e-07, + "loss": 0.0311, + "reward": 3.5625, + "reward_std": 1.6434174478054047, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.09375, + "epoch": 53.666666666666664, + "grad_norm": 1.4253180533139413, + "kl": 0.0416259765625, + "learning_rate": 4.793375128755933e-07, + "loss": -0.0401, + "reward": 4.03125, + "reward_std": 2.570079743862152, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.4375, + "epoch": 53.888888888888886, + "grad_norm": 1.740974206086713, + "kl": 0.04815673828125, + "learning_rate": 4.752081226633888e-07, + "loss": -0.038, + "reward": 4.34375, + "reward_std": 2.6059716939926147, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.0625, + "epoch": 54.22222222222222, + "grad_norm": 1.6032171003103113, + "kl": 0.05572509765625, + "learning_rate": 4.71080426943068e-07, + "loss": 0.0092, + "reward": 3.0625, + "reward_std": 1.996816635131836, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.75, + "rewards/format_reward_staging": 0.90625, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completion_length": 530.03125, + "epoch": 54.44444444444444, + "grad_norm": 1.3127356050754018, + "kl": 0.0540771484375, + "learning_rate": 4.669547078371503e-07, + "loss": -0.0245, + "reward": 6.59375, + "reward_std": 2.073159486055374, + "rewards/accuracy_reward_staging": 0.9375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.3125, + "epoch": 54.666666666666664, + "grad_norm": 1.7016991990394592, + "kl": 0.05010986328125, + "learning_rate": 4.628312473330562e-07, + "loss": 0.0702, + "reward": 3.875, + "reward_std": 2.482748866081238, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.375, + "epoch": 54.888888888888886, + "grad_norm": 1.3141608271515532, + "kl": 0.04742431640625, + "learning_rate": 4.5871032726383385e-07, + "loss": 0.0552, + "reward": 3.125, + "reward_std": 1.3886407911777496, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 578.71875, + "epoch": 55.22222222222222, + "grad_norm": 1.3932263109990592, + "kl": 0.04364013671875, + "learning_rate": 4.5459222928889587e-07, + "loss": 0.051, + "reward": 3.71875, + "reward_std": 1.7805703282356262, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.9375, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completion_length": 588.4375, + "epoch": 55.44444444444444, + "grad_norm": 1.5339621078239263, + "kl": 0.04962158203125, + "learning_rate": 4.5047723487476864e-07, + "loss": -0.0216, + "reward": 3.46875, + "reward_std": 2.488185405731201, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 1.0, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.34375, + "epoch": 55.666666666666664, + "grad_norm": 1.6607509386936015, + "kl": 0.04962158203125, + "learning_rate": 4.463656252758542e-07, + "loss": 0.0452, + "reward": 3.8125, + "reward_std": 2.171033263206482, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.9375, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completion_length": 631.75, + "epoch": 55.888888888888886, + "grad_norm": 1.5614778713632624, + "kl": 0.04669189453125, + "learning_rate": 4.4225768151520694e-07, + "loss": 0.0801, + "reward": 3.5625, + "reward_std": 2.430722177028656, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.9375, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.75, + "epoch": 56.22222222222222, + "grad_norm": 1.5004046938088074, + "kl": 0.05950927734375, + "learning_rate": 4.381536843653261e-07, + "loss": 0.0698, + "reward": 3.59375, + "reward_std": 2.5734615325927734, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.9375, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.375, + "epoch": 56.44444444444444, + "grad_norm": 1.3766714019303354, + "kl": 0.04168701171875, + "learning_rate": 4.340539143289655e-07, + "loss": 0.0233, + "reward": 3.5, + "reward_std": 2.0, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.125, + "epoch": 56.666666666666664, + "grad_norm": 1.307050706736634, + "kl": 0.05133056640625, + "learning_rate": 4.2995865161996104e-07, + "loss": 0.0181, + "reward": 4.0625, + "reward_std": 2.421202301979065, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.4375, + "epoch": 56.888888888888886, + "grad_norm": 1.5405733998671278, + "kl": 0.0562744140625, + "learning_rate": 4.258681761440789e-07, + "loss": 0.0017, + "reward": 4.03125, + "reward_std": 2.49512779712677, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 621.84375, + "epoch": 57.22222222222222, + "grad_norm": 1.606949877632979, + "kl": 0.044189453125, + "learning_rate": 4.2178276747988444e-07, + "loss": -0.0076, + "reward": 4.3125, + "reward_std": 2.390491783618927, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.90625, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.84375, + "epoch": 57.44444444444444, + "grad_norm": 1.5411205221206894, + "kl": 0.0574951171875, + "learning_rate": 4.1770270485963294e-07, + "loss": -0.0387, + "reward": 3.125, + "reward_std": 2.1638975143432617, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 603.1875, + "epoch": 57.666666666666664, + "grad_norm": 1.3383276534008064, + "kl": 0.04473876953125, + "learning_rate": 4.1362826715018497e-07, + "loss": 0.0122, + "reward": 3.6875, + "reward_std": 1.9202269613742828, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.90625, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.25, + "epoch": 57.888888888888886, + "grad_norm": 1.7484795613881616, + "kl": 0.06341552734375, + "learning_rate": 4.095597328339452e-07, + "loss": -0.0426, + "reward": 4.46875, + "reward_std": 2.5560158491134644, + "rewards/accuracy_reward_staging": 0.53125, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.96875, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.125, + "epoch": 58.22222222222222, + "grad_norm": 1.5442704440086175, + "kl": 0.05377197265625, + "learning_rate": 4.0549737998982994e-07, + "loss": -0.0062, + "reward": 3.65625, + "reward_std": 2.2512659430503845, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.59375, + "epoch": 58.44444444444444, + "grad_norm": 1.3070749287077408, + "kl": 0.05706787109375, + "learning_rate": 4.0144148627425986e-07, + "loss": 0.0357, + "reward": 4.5625, + "reward_std": 2.173893690109253, + "rewards/accuracy_reward_staging": 0.53125, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 1.0, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.25, + "epoch": 58.666666666666664, + "grad_norm": 1.568215525888831, + "kl": 0.04644775390625, + "learning_rate": 3.973923289021829e-07, + "loss": -0.0236, + "reward": 3.375, + "reward_std": 2.125, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.875, + "epoch": 58.888888888888886, + "grad_norm": 1.247655763308189, + "kl": 0.05523681640625, + "learning_rate": 3.9335018462812664e-07, + "loss": 0.0335, + "reward": 4.40625, + "reward_std": 1.7515006065368652, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.75, + "epoch": 59.22222222222222, + "grad_norm": 1.4876134624852135, + "kl": 0.05291748046875, + "learning_rate": 3.893153297272828e-07, + "loss": 0.0246, + "reward": 3.28125, + "reward_std": 1.5280899405479431, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.9375, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.0, + "epoch": 59.44444444444444, + "grad_norm": 1.6243663627358595, + "kl": 0.04718017578125, + "learning_rate": 3.8528803997662423e-07, + "loss": -0.0226, + "reward": 4.5625, + "reward_std": 2.9370444416999817, + "rewards/accuracy_reward_staging": 0.53125, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.90625, + "epoch": 59.666666666666664, + "grad_norm": 1.579154131750563, + "kl": 0.05328369140625, + "learning_rate": 3.812685906360557e-07, + "loss": -0.0118, + "reward": 3.5625, + "reward_std": 1.8252411782741547, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.875, + "epoch": 59.888888888888886, + "grad_norm": 1.6568742956735238, + "kl": 0.05029296875, + "learning_rate": 3.772572564296004e-07, + "loss": 0.0049, + "reward": 4.21875, + "reward_std": 2.6711304783821106, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.71875, + "epoch": 60.22222222222222, + "grad_norm": 1.5110623474715636, + "kl": 0.05316162109375, + "learning_rate": 3.7325431152662294e-07, + "loss": 0.004, + "reward": 3.65625, + "reward_std": 2.44047012925148, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completion_length": 607.71875, + "epoch": 60.44444444444444, + "grad_norm": 1.5588742571636938, + "kl": 0.05126953125, + "learning_rate": 3.692600295230901e-07, + "loss": 0.0174, + "reward": 4.125, + "reward_std": 2.93262779712677, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.375, + "epoch": 60.666666666666664, + "grad_norm": 1.4200468362196192, + "kl": 0.05487060546875, + "learning_rate": 3.6527468342287096e-07, + "loss": 0.1256, + "reward": 3.8125, + "reward_std": 2.782258152961731, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.90625, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.9375, + "epoch": 60.888888888888886, + "grad_norm": 2.112230364965324, + "kl": 0.06414794921875, + "learning_rate": 3.612985456190778e-07, + "loss": -0.0099, + "reward": 4.0625, + "reward_std": 2.503733992576599, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 1.0, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.84375, + "epoch": 61.22222222222222, + "grad_norm": 1.5256554050376716, + "kl": 0.0540771484375, + "learning_rate": 3.5733188787544746e-07, + "loss": 0.0285, + "reward": 3.75, + "reward_std": 2.553140878677368, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.53125, + "epoch": 61.44444444444444, + "grad_norm": 1.5714805767176323, + "kl": 0.0645751953125, + "learning_rate": 3.533749813077677e-07, + "loss": 0.0666, + "reward": 4.71875, + "reward_std": 2.595756232738495, + "rewards/accuracy_reward_staging": 0.59375, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.9375, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.90625, + "epoch": 61.666666666666664, + "grad_norm": 1.3717833382169582, + "kl": 0.05242919921875, + "learning_rate": 3.4942809636534633e-07, + "loss": 0.0464, + "reward": 4.375, + "reward_std": 1.9917186498641968, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completion_length": 648.6875, + "epoch": 61.888888888888886, + "grad_norm": 1.281888219474357, + "kl": 0.05694580078125, + "learning_rate": 3.454915028125263e-07, + "loss": -0.0053, + "reward": 4.1875, + "reward_std": 1.8432075381278992, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.90625, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.28125, + "epoch": 62.22222222222222, + "grad_norm": 1.2189149322070956, + "kl": 0.05279541015625, + "learning_rate": 3.415654697102478e-07, + "loss": -0.0095, + "reward": 3.65625, + "reward_std": 1.4233438968658447, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 249 + }, + { + "epoch": 62.44444444444444, + "grad_norm": 1.7869776388477054, + "learning_rate": 3.3765026539765827e-07, + "loss": 0.0694, + "step": 250 + }, + { + "epoch": 62.44444444444444, + "eval_clip_ratio": 0.0, + "eval_completion_length": 597.85, + "eval_kl": 0.050439453125, + "eval_loss": 0.033870112150907516, + "eval_reward": 2.5, + "eval_reward_std": 1.5911447525024414, + "eval_rewards/accuracy_reward_staging": 0.15, + "eval_rewards/format_reward": 0.825, + "eval_rewards/format_reward_staging": 0.925, + "eval_runtime": 53.5113, + "eval_samples_per_second": 0.673, + "eval_steps_per_second": 0.093, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.40625, + "epoch": 62.666666666666664, + "grad_norm": 1.580273768681387, + "kl": 0.058380126953125, + "learning_rate": 3.337461574737716e-07, + "loss": 0.0381, + "reward": 3.59375, + "reward_std": 1.963532954454422, + "rewards/accuracy_reward_staging": 0.359375, + "rewards/format_reward": 0.859375, + "rewards/format_reward_staging": 0.9375, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completion_length": 629.9375, + "epoch": 62.888888888888886, + "grad_norm": 1.4446074061408753, + "kl": 0.04742431640625, + "learning_rate": 3.2985341277917846e-07, + "loss": 0.0576, + "reward": 3.5625, + "reward_std": 1.8048822581768036, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 610.03125, + "epoch": 63.22222222222222, + "grad_norm": 2.4797282896452084, + "kl": 0.06011962890625, + "learning_rate": 3.2597229737780774e-07, + "loss": 0.0258, + "reward": 2.71875, + "reward_std": 1.841366171836853, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.84375, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.8125, + "epoch": 63.44444444444444, + "grad_norm": 1.413545282954978, + "kl": 0.04827880859375, + "learning_rate": 3.221030765387417e-07, + "loss": 0.0266, + "reward": 4.0, + "reward_std": 1.7409893572330475, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.90625, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 625.125, + "epoch": 63.666666666666664, + "grad_norm": 1.4981329871397806, + "kl": 0.04962158203125, + "learning_rate": 3.1824601471808497e-07, + "loss": 0.0841, + "reward": 4.5625, + "reward_std": 3.0762142539024353, + "rewards/accuracy_reward_staging": 0.5625, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.9375, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 633.125, + "epoch": 63.888888888888886, + "grad_norm": 1.4534244133457102, + "kl": 0.04656982421875, + "learning_rate": 3.1440137554088953e-07, + "loss": 0.029, + "reward": 3.84375, + "reward_std": 2.296931117773056, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.59375, + "epoch": 64.22222222222223, + "grad_norm": 1.586114819510263, + "kl": 0.05950927734375, + "learning_rate": 3.1056942178313604e-07, + "loss": 0.0666, + "reward": 4.375, + "reward_std": 2.7632179856300354, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.9375, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.5, + "epoch": 64.44444444444444, + "grad_norm": 1.4820895514814123, + "kl": 0.057373046875, + "learning_rate": 3.06750415353774e-07, + "loss": 0.015, + "reward": 4.34375, + "reward_std": 2.6667675375938416, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.3125, + "epoch": 64.66666666666667, + "grad_norm": 1.4710703551980364, + "kl": 0.05108642578125, + "learning_rate": 3.029446172768193e-07, + "loss": -0.0532, + "reward": 3.71875, + "reward_std": 1.9592358469963074, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completion_length": 615.125, + "epoch": 64.88888888888889, + "grad_norm": 1.195494273136427, + "kl": 0.05078125, + "learning_rate": 2.9915228767351535e-07, + "loss": -0.0471, + "reward": 3.71875, + "reward_std": 1.5842358469963074, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.78125, + "epoch": 65.22222222222223, + "grad_norm": 1.0142470745003664, + "kl": 0.0562744140625, + "learning_rate": 2.9537368574455303e-07, + "loss": 0.0116, + "reward": 3.90625, + "reward_std": 1.3764855861663818, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.5625, + "epoch": 65.44444444444444, + "grad_norm": 1.2621208103940496, + "kl": 0.045166015625, + "learning_rate": 2.916090697523549e-07, + "loss": 0.0065, + "reward": 3.5625, + "reward_std": 1.8217839002609253, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 639.625, + "epoch": 65.66666666666667, + "grad_norm": 1.2178177535688726, + "kl": 0.06982421875, + "learning_rate": 2.878586970034232e-07, + "loss": 0.0063, + "reward": 2.8125, + "reward_std": 1.2878219783306122, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.875, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completion_length": 582.875, + "epoch": 65.88888888888889, + "grad_norm": 1.6224844954097977, + "kl": 0.04864501953125, + "learning_rate": 2.841228238307536e-07, + "loss": -0.0201, + "reward": 5.03125, + "reward_std": 2.2327269315719604, + "rewards/accuracy_reward_staging": 0.625, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 637.15625, + "epoch": 66.22222222222223, + "grad_norm": 1.2308430687264216, + "kl": 0.05279541015625, + "learning_rate": 2.8040170557631485e-07, + "loss": 0.0153, + "reward": 3.46875, + "reward_std": 2.0372338593006134, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.4375, + "epoch": 66.44444444444444, + "grad_norm": 1.5321111568180714, + "kl": 0.04827880859375, + "learning_rate": 2.7669559657359673e-07, + "loss": -0.0491, + "reward": 3.8125, + "reward_std": 2.4646694660186768, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 1.0, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 615.5, + "epoch": 66.66666666666667, + "grad_norm": 1.3679824700888612, + "kl": 0.05462646484375, + "learning_rate": 2.730047501302266e-07, + "loss": 0.0308, + "reward": 3.09375, + "reward_std": 2.642750769853592, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.90625, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.5, + "epoch": 66.88888888888889, + "grad_norm": 1.5033503223897624, + "kl": 0.0577392578125, + "learning_rate": 2.6932941851065615e-07, + "loss": -0.0215, + "reward": 4.9375, + "reward_std": 2.482675850391388, + "rewards/accuracy_reward_staging": 0.59375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.34375, + "epoch": 67.22222222222223, + "grad_norm": 1.4290897914516596, + "kl": 0.05340576171875, + "learning_rate": 2.656698529189193e-07, + "loss": 0.0366, + "reward": 3.78125, + "reward_std": 1.9895031452178955, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 1.0, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.1875, + "epoch": 67.44444444444444, + "grad_norm": 1.579184850033335, + "kl": 0.0518798828125, + "learning_rate": 2.620263034814632e-07, + "loss": 0.0078, + "reward": 4.4375, + "reward_std": 2.323539137840271, + "rewards/accuracy_reward_staging": 0.53125, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.90625, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.34375, + "epoch": 67.66666666666667, + "grad_norm": 1.482511162141472, + "kl": 0.0482177734375, + "learning_rate": 2.58399019230052e-07, + "loss": -0.0587, + "reward": 3.6875, + "reward_std": 2.195499747991562, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.0, + "epoch": 67.88888888888889, + "grad_norm": 1.4652114217200525, + "kl": 0.049560546875, + "learning_rate": 2.547882480847461e-07, + "loss": 0.0021, + "reward": 3.1875, + "reward_std": 2.073539137840271, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.59375, + "epoch": 68.22222222222223, + "grad_norm": 1.5345326135427537, + "kl": 0.04913330078125, + "learning_rate": 2.5119423683695657e-07, + "loss": -0.0357, + "reward": 4.25, + "reward_std": 2.9848236441612244, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.4375, + "epoch": 68.44444444444444, + "grad_norm": 1.5990838171502337, + "kl": 0.061279296875, + "learning_rate": 2.476172311325783e-07, + "loss": 0.0292, + "reward": 5.1875, + "reward_std": 2.957588255405426, + "rewards/accuracy_reward_staging": 0.6875, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.90625, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 632.71875, + "epoch": 68.66666666666667, + "grad_norm": 2.3340038254897566, + "kl": 0.06951904296875, + "learning_rate": 2.440574754551996e-07, + "loss": 0.0246, + "reward": 3.5, + "reward_std": 2.0238241851329803, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 1.0, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 648.0625, + "epoch": 68.88888888888889, + "grad_norm": 1.5884432300379054, + "kl": 0.04443359375, + "learning_rate": 2.4051521310939254e-07, + "loss": 0.1177, + "reward": 4.0, + "reward_std": 1.8069141209125519, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.90625, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.625, + "epoch": 69.22222222222223, + "grad_norm": 2.985922020759926, + "kl": 0.10992431640625, + "learning_rate": 2.3699068620408301e-07, + "loss": 0.0152, + "reward": 3.15625, + "reward_std": 1.511039137840271, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 1.0, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completion_length": 602.25, + "epoch": 69.44444444444444, + "grad_norm": 1.5923870878518707, + "kl": 0.056396484375, + "learning_rate": 2.3348413563600323e-07, + "loss": 0.0176, + "reward": 4.5, + "reward_std": 2.31710484623909, + "rewards/accuracy_reward_staging": 0.53125, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.46875, + "epoch": 69.66666666666667, + "grad_norm": 1.5951806343259411, + "kl": 0.04925537109375, + "learning_rate": 2.2999580107322654e-07, + "loss": 0.0929, + "reward": 4.9375, + "reward_std": 2.494741439819336, + "rewards/accuracy_reward_staging": 0.59375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.6875, + "epoch": 69.88888888888889, + "grad_norm": 1.5129561001363085, + "kl": 0.0699462890625, + "learning_rate": 2.2652592093878665e-07, + "loss": 0.0125, + "reward": 4.25, + "reward_std": 1.878759890794754, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.9375, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 645.0, + "epoch": 70.22222222222223, + "grad_norm": 1.5182810808110982, + "kl": 0.0643310546875, + "learning_rate": 2.2307473239438152e-07, + "loss": 0.01, + "reward": 4.40625, + "reward_std": 2.5910332798957825, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 1.0, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completion_length": 569.21875, + "epoch": 70.44444444444444, + "grad_norm": 1.8382342741040079, + "kl": 0.05499267578125, + "learning_rate": 2.1964247132416368e-07, + "loss": 0.0019, + "reward": 4.40625, + "reward_std": 3.0214737951755524, + "rewards/accuracy_reward_staging": 0.5625, + "rewards/format_reward": 0.6875, + "rewards/format_reward_staging": 0.90625, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.0, + "epoch": 70.66666666666667, + "grad_norm": 1.7202842060510593, + "kl": 0.04736328125, + "learning_rate": 2.1622937231861822e-07, + "loss": 0.0307, + "reward": 3.375, + "reward_std": 2.42453271150589, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.875, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.9375, + "epoch": 70.88888888888889, + "grad_norm": 1.4517912073118557, + "kl": 0.04290771484375, + "learning_rate": 2.128356686585282e-07, + "loss": 0.0476, + "reward": 3.75, + "reward_std": 1.7858919501304626, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.9375, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 649.53125, + "epoch": 71.22222222222223, + "grad_norm": 1.259142583692514, + "kl": 0.0467529296875, + "learning_rate": 2.0946159229903088e-07, + "loss": 0.0839, + "reward": 2.84375, + "reward_std": 1.5846085250377655, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.875, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 625.875, + "epoch": 71.44444444444444, + "grad_norm": 1.340679975407133, + "kl": 0.0565185546875, + "learning_rate": 2.0610737385376348e-07, + "loss": 0.0085, + "reward": 3.59375, + "reward_std": 1.975972980260849, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.875, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completion_length": 603.34375, + "epoch": 71.66666666666667, + "grad_norm": 1.4289962866267603, + "kl": 0.06005859375, + "learning_rate": 2.0277324257910106e-07, + "loss": 0.0185, + "reward": 5.5, + "reward_std": 2.4536279439926147, + "rewards/accuracy_reward_staging": 0.75, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.9375, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.25, + "epoch": 71.88888888888889, + "grad_norm": 1.6936868571901433, + "kl": 0.0546875, + "learning_rate": 1.9945942635848745e-07, + "loss": 0.0145, + "reward": 3.78125, + "reward_std": 2.1591877937316895, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 636.28125, + "epoch": 72.22222222222223, + "grad_norm": 1.3457751552584203, + "kl": 0.0472412109375, + "learning_rate": 1.9616615168685942e-07, + "loss": 0.0082, + "reward": 3.375, + "reward_std": 1.7216877937316895, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.1875, + "epoch": 72.44444444444444, + "grad_norm": 1.1708504647450504, + "kl": 0.0599365234375, + "learning_rate": 1.9289364365516607e-07, + "loss": 0.015, + "reward": 4.46875, + "reward_std": 1.2958193719387054, + "rewards/accuracy_reward_staging": 0.53125, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.125, + "epoch": 72.66666666666667, + "grad_norm": 2.00516555564966, + "kl": 0.065185546875, + "learning_rate": 1.896421259349844e-07, + "loss": 0.0357, + "reward": 4.21875, + "reward_std": 2.589491307735443, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completion_length": 672.75, + "epoch": 72.88888888888889, + "grad_norm": 2.8218244852832335, + "kl": 0.09649658203125, + "learning_rate": 1.8641182076323148e-07, + "loss": -0.0058, + "reward": 5.03125, + "reward_std": 3.2576534748077393, + "rewards/accuracy_reward_staging": 0.625, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 577.65625, + "epoch": 73.22222222222223, + "grad_norm": 1.7581055322994823, + "kl": 0.06195068359375, + "learning_rate": 1.8320294892697475e-07, + "loss": 0.0534, + "reward": 3.0, + "reward_std": 2.1200742721557617, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.75, + "rewards/format_reward_staging": 0.84375, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.6875, + "epoch": 73.44444444444444, + "grad_norm": 1.5050675135024016, + "kl": 0.0499267578125, + "learning_rate": 1.8001572974834168e-07, + "loss": 0.0343, + "reward": 4.0, + "reward_std": 1.9108919501304626, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 590.5, + "epoch": 73.66666666666667, + "grad_norm": 12.895158631725321, + "kl": 0.12432861328125, + "learning_rate": 1.768503810695295e-07, + "loss": 0.0513, + "reward": 3.46875, + "reward_std": 1.6672459840774536, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 1.0, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 609.125, + "epoch": 73.88888888888889, + "grad_norm": 1.7151482870021748, + "kl": 0.07269287109375, + "learning_rate": 1.7370711923791564e-07, + "loss": -0.0106, + "reward": 5.625, + "reward_std": 2.8527393341064453, + "rewards/accuracy_reward_staging": 0.78125, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.875, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 675.34375, + "epoch": 74.22222222222223, + "grad_norm": 1.6055158333490096, + "kl": 0.0538330078125, + "learning_rate": 1.70586159091271e-07, + "loss": 0.0916, + "reward": 3.53125, + "reward_std": 2.737855911254883, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.6875, + "rewards/format_reward_staging": 0.8125, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completion_length": 638.0, + "epoch": 74.44444444444444, + "grad_norm": 1.3927154732757459, + "kl": 0.0494384765625, + "learning_rate": 1.674877139430758e-07, + "loss": -0.0039, + "reward": 3.5, + "reward_std": 2.132579743862152, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.78125, + "epoch": 74.66666666666667, + "grad_norm": 1.2941317675033293, + "kl": 0.05804443359375, + "learning_rate": 1.6441199556794034e-07, + "loss": 0.0582, + "reward": 3.28125, + "reward_std": 2.0324151515960693, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 299 + }, + { + "epoch": 74.88888888888889, + "grad_norm": 1.1895166775660873, + "learning_rate": 1.6135921418712955e-07, + "loss": 0.0154, + "step": 300 + }, + { + "epoch": 74.88888888888889, + "eval_clip_ratio": 0.0, + "eval_completion_length": 558.5, + "eval_kl": 0.055908203125, + "eval_loss": 0.04830198734998703, + "eval_reward": 3.25, + "eval_reward_std": 2.227747082710266, + "eval_rewards/accuracy_reward_staging": 0.275, + "eval_rewards/format_reward": 0.9, + "eval_rewards/format_reward_staging": 0.975, + "eval_runtime": 50.8525, + "eval_samples_per_second": 0.708, + "eval_steps_per_second": 0.098, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.46875, + "epoch": 75.22222222222223, + "grad_norm": 1.2719829976418617, + "kl": 0.05950927734375, + "learning_rate": 1.5832957845419582e-07, + "loss": -0.0239, + "reward": 4.078125, + "reward_std": 1.734619602560997, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.921875, + "rewards/format_reward_staging": 0.96875, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completion_length": 531.75, + "epoch": 75.44444444444444, + "grad_norm": 1.4701942608061176, + "kl": 0.05584716796875, + "learning_rate": 1.553232954407171e-07, + "loss": -0.0222, + "reward": 4.46875, + "reward_std": 1.8445461988449097, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.6875, + "epoch": 75.66666666666667, + "grad_norm": 0.979841248868734, + "kl": 0.0506591796875, + "learning_rate": 1.52340570622144e-07, + "loss": 0.0094, + "reward": 4.34375, + "reward_std": 1.0341877937316895, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completion_length": 645.0, + "epoch": 75.88888888888889, + "grad_norm": 1.2907279139619887, + "kl": 0.05084228515625, + "learning_rate": 1.493816078637557e-07, + "loss": 0.0349, + "reward": 4.03125, + "reward_std": 2.768365204334259, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.9375, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 612.03125, + "epoch": 76.22222222222223, + "grad_norm": 1.3052082852261886, + "kl": 0.06219482421875, + "learning_rate": 1.4644660940672627e-07, + "loss": 0.0241, + "reward": 3.90625, + "reward_std": 1.5625, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 1.0, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.0, + "epoch": 76.44444444444444, + "grad_norm": 1.6816507380806482, + "kl": 0.0640869140625, + "learning_rate": 1.435357758543015e-07, + "loss": 0.0623, + "reward": 3.5, + "reward_std": 2.3343209326267242, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.90625, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completion_length": 605.96875, + "epoch": 76.66666666666667, + "grad_norm": 1.7963549332670843, + "kl": 0.05462646484375, + "learning_rate": 1.4064930615808806e-07, + "loss": -0.0141, + "reward": 3.90625, + "reward_std": 3.359531879425049, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.5625, + "epoch": 76.88888888888889, + "grad_norm": 1.3270350222684457, + "kl": 0.0548095703125, + "learning_rate": 1.3778739760445552e-07, + "loss": 0.0232, + "reward": 3.53125, + "reward_std": 2.031329423189163, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.9375, + "epoch": 77.22222222222223, + "grad_norm": 1.403581677625956, + "kl": 0.0579833984375, + "learning_rate": 1.349502458010519e-07, + "loss": 0.0045, + "reward": 3.40625, + "reward_std": 1.5280899405479431, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.9375, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.4375, + "epoch": 77.44444444444444, + "grad_norm": 1.4518085181139868, + "kl": 0.05694580078125, + "learning_rate": 1.321380446634342e-07, + "loss": -0.0332, + "reward": 4.53125, + "reward_std": 2.796904981136322, + "rewards/accuracy_reward_staging": 0.5625, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.90625, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.375, + "epoch": 77.66666666666667, + "grad_norm": 1.4193852483613092, + "kl": 0.04937744140625, + "learning_rate": 1.2935098640181457e-07, + "loss": 0.0097, + "reward": 3.71875, + "reward_std": 1.6591877937316895, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.5625, + "epoch": 77.88888888888889, + "grad_norm": 1.5563017115814217, + "kl": 0.055419921875, + "learning_rate": 1.2658926150792322e-07, + "loss": 0.0595, + "reward": 4.03125, + "reward_std": 2.8135814666748047, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 618.15625, + "epoch": 78.22222222222223, + "grad_norm": 1.7591845192775064, + "kl": 0.05523681640625, + "learning_rate": 1.2385305874198775e-07, + "loss": -0.0554, + "reward": 2.625, + "reward_std": 1.8215623199939728, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.90625, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.40625, + "epoch": 78.44444444444444, + "grad_norm": 1.532211221198721, + "kl": 0.04986572265625, + "learning_rate": 1.2114256511983274e-07, + "loss": 0.0323, + "reward": 4.40625, + "reward_std": 2.975598633289337, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.71875, + "epoch": 78.66666666666667, + "grad_norm": 1.827273461906554, + "kl": 0.054931640625, + "learning_rate": 1.1845796590009683e-07, + "loss": 0.1089, + "reward": 4.28125, + "reward_std": 2.9560980796813965, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.90625, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 607.125, + "epoch": 78.88888888888889, + "grad_norm": 1.5705329008308428, + "kl": 0.05218505859375, + "learning_rate": 1.1579944457157059e-07, + "loss": 0.0714, + "reward": 3.53125, + "reward_std": 2.3595376014709473, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.84375, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.59375, + "epoch": 79.22222222222223, + "grad_norm": 1.2784333227971698, + "kl": 0.04998779296875, + "learning_rate": 1.1316718284065535e-07, + "loss": -0.0327, + "reward": 3.1875, + "reward_std": 1.75, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.6875, + "epoch": 79.44444444444444, + "grad_norm": 1.2996658268690655, + "kl": 0.05255126953125, + "learning_rate": 1.1056136061894384e-07, + "loss": -0.0387, + "reward": 4.5, + "reward_std": 1.6467358469963074, + "rewards/accuracy_reward_staging": 0.53125, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.25, + "epoch": 79.66666666666667, + "grad_norm": 1.5574614421463486, + "kl": 0.04937744140625, + "learning_rate": 1.0798215601092353e-07, + "loss": 0.0303, + "reward": 4.375, + "reward_std": 2.325068473815918, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completion_length": 626.0625, + "epoch": 79.88888888888889, + "grad_norm": 1.5754896085780978, + "kl": 0.0546875, + "learning_rate": 1.0542974530180327e-07, + "loss": 0.0137, + "reward": 4.1875, + "reward_std": 2.3722406029701233, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.9375, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.71875, + "epoch": 80.22222222222223, + "grad_norm": 1.610726394417215, + "kl": 0.06024169921875, + "learning_rate": 1.0290430294546448e-07, + "loss": 0.013, + "reward": 3.90625, + "reward_std": 2.4335986375808716, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.9375, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completion_length": 655.25, + "epoch": 80.44444444444444, + "grad_norm": 1.4911116339078845, + "kl": 0.0523681640625, + "learning_rate": 1.0040600155253764e-07, + "loss": 0.0332, + "reward": 2.78125, + "reward_std": 1.3004322350025177, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.3125, + "epoch": 80.66666666666667, + "grad_norm": 1.6907992973095978, + "kl": 0.0538330078125, + "learning_rate": 9.793501187860431e-08, + "loss": -0.0401, + "reward": 4.0, + "reward_std": 2.362515449523926, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.5, + "epoch": 80.88888888888889, + "grad_norm": 1.646427054544714, + "kl": 0.063232421875, + "learning_rate": 9.549150281252632e-08, + "loss": -0.0039, + "reward": 4.15625, + "reward_std": 2.2053900957107544, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.03125, + "epoch": 81.22222222222223, + "grad_norm": 1.2401904323679076, + "kl": 0.059814453125, + "learning_rate": 9.307564136490254e-08, + "loss": 0.0337, + "reward": 2.6875, + "reward_std": 1.4073790609836578, + "rewards/accuracy_reward_staging": 0.15625, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 1.0, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.3125, + "epoch": 81.44444444444444, + "grad_norm": 1.5857599353606664, + "kl": 0.0521240234375, + "learning_rate": 9.068759265665382e-08, + "loss": 0.0031, + "reward": 3.46875, + "reward_std": 2.041439712047577, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completion_length": 636.15625, + "epoch": 81.66666666666667, + "grad_norm": 1.464039724178544, + "kl": 0.04815673828125, + "learning_rate": 8.832751990773712e-08, + "loss": -0.033, + "reward": 4.0625, + "reward_std": 2.3850997388362885, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.9375, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.625, + "epoch": 81.88888888888889, + "grad_norm": 1.546052526056493, + "kl": 0.05743408203125, + "learning_rate": 8.599558442598998e-08, + "loss": 0.0427, + "reward": 4.15625, + "reward_std": 2.8091025352478027, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.875, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.53125, + "epoch": 82.22222222222223, + "grad_norm": 1.359491111417973, + "kl": 0.05621337890625, + "learning_rate": 8.369194559610481e-08, + "loss": 0.0752, + "reward": 3.03125, + "reward_std": 1.4954701960086823, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.90625, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.78125, + "epoch": 82.44444444444444, + "grad_norm": 1.4795564527051304, + "kl": 0.05267333984375, + "learning_rate": 8.141676086873573e-08, + "loss": 0.0759, + "reward": 3.28125, + "reward_std": 1.9649099707603455, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.9375, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.0625, + "epoch": 82.66666666666667, + "grad_norm": 1.6241400838987166, + "kl": 0.0562744140625, + "learning_rate": 7.917018574973644e-08, + "loss": 0.0196, + "reward": 4.40625, + "reward_std": 2.2960872054100037, + "rewards/accuracy_reward_staging": 0.53125, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.96875, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.5, + "epoch": 82.88888888888889, + "grad_norm": 1.5339258798115873, + "kl": 0.0474853515625, + "learning_rate": 7.695237378953224e-08, + "loss": -0.0209, + "reward": 4.5, + "reward_std": 2.332531690597534, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 588.34375, + "epoch": 83.22222222222223, + "grad_norm": 1.5601324006274968, + "kl": 0.05804443359375, + "learning_rate": 7.476347657262455e-08, + "loss": -0.039, + "reward": 4.71875, + "reward_std": 2.439529001712799, + "rewards/accuracy_reward_staging": 0.5625, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 333 + }, + { + "clip_ratio": 0.0, + "completion_length": 558.25, + "epoch": 83.44444444444444, + "grad_norm": 1.5625484764568953, + "kl": 0.05841064453125, + "learning_rate": 7.260364370723043e-08, + "loss": -0.0022, + "reward": 3.875, + "reward_std": 2.6049662828445435, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completion_length": 538.53125, + "epoch": 83.66666666666667, + "grad_norm": 1.6578933804792892, + "kl": 0.064697265625, + "learning_rate": 7.047302281505735e-08, + "loss": 0.0178, + "reward": 3.6875, + "reward_std": 1.93262779712677, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.3125, + "epoch": 83.88888888888889, + "grad_norm": 1.737744709396854, + "kl": 0.05303955078125, + "learning_rate": 6.837175952121304e-08, + "loss": -0.056, + "reward": 3.875, + "reward_std": 2.5176164507865906, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.25, + "epoch": 84.22222222222223, + "grad_norm": 1.5777065212148367, + "kl": 0.0582275390625, + "learning_rate": 6.629999744425235e-08, + "loss": -0.0542, + "reward": 3.3125, + "reward_std": 1.8360159397125244, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.90625, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completion_length": 604.03125, + "epoch": 84.44444444444444, + "grad_norm": 1.7153117337295096, + "kl": 0.05419921875, + "learning_rate": 6.42578781863613e-08, + "loss": 0.0782, + "reward": 3.625, + "reward_std": 3.0492074489593506, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.90625, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.3125, + "epoch": 84.66666666666667, + "grad_norm": 1.3347688495066687, + "kl": 0.053466796875, + "learning_rate": 6.22455413236786e-08, + "loss": -0.0014, + "reward": 3.03125, + "reward_std": 1.389709249138832, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.90625, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completion_length": 633.6875, + "epoch": 84.88888888888889, + "grad_norm": 1.389754239090955, + "kl": 0.04827880859375, + "learning_rate": 6.026312439675551e-08, + "loss": 0.0256, + "reward": 4.25, + "reward_std": 2.171033263206482, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.59375, + "epoch": 85.22222222222223, + "grad_norm": 1.7000896266286212, + "kl": 0.06982421875, + "learning_rate": 5.831076290115572e-08, + "loss": 0.0243, + "reward": 4.15625, + "reward_std": 2.343973159790039, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.96875, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.8125, + "epoch": 85.44444444444444, + "grad_norm": 1.4833240972495387, + "kl": 0.056884765625, + "learning_rate": 5.638859027819409e-08, + "loss": 0.0553, + "reward": 3.53125, + "reward_std": 2.29950013756752, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completion_length": 650.875, + "epoch": 85.66666666666667, + "grad_norm": 1.3618290470634058, + "kl": 0.04998779296875, + "learning_rate": 5.44967379058161e-08, + "loss": -0.0017, + "reward": 5.0, + "reward_std": 2.3323360979557037, + "rewards/accuracy_reward_staging": 0.625, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.9375, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completion_length": 576.75, + "epoch": 85.88888888888889, + "grad_norm": 1.749186308713559, + "kl": 0.05303955078125, + "learning_rate": 5.263533508961826e-08, + "loss": 0.0794, + "reward": 3.34375, + "reward_std": 2.082039564847946, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.9375, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.3125, + "epoch": 86.22222222222223, + "grad_norm": 1.4322968854479468, + "kl": 0.05615234375, + "learning_rate": 5.080450905401057e-08, + "loss": 0.0153, + "reward": 4.25, + "reward_std": 1.8755539804697037, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.59375, + "epoch": 86.44444444444444, + "grad_norm": 1.2682803454826486, + "kl": 0.053955078125, + "learning_rate": 4.9004384933520547e-08, + "loss": 0.0083, + "reward": 3.53125, + "reward_std": 1.3726893961429596, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completion_length": 580.21875, + "epoch": 86.66666666666667, + "grad_norm": 1.5693179766123742, + "kl": 0.05389404296875, + "learning_rate": 4.723508576424062e-08, + "loss": -0.0063, + "reward": 3.46875, + "reward_std": 2.777799040079117, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.90625, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.375, + "epoch": 86.88888888888889, + "grad_norm": 1.6314212035379032, + "kl": 0.053955078125, + "learning_rate": 4.549673247541874e-08, + "loss": -0.01, + "reward": 4.15625, + "reward_std": 2.311874210834503, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 568.71875, + "epoch": 87.22222222222223, + "grad_norm": 1.658360075079596, + "kl": 0.0572509765625, + "learning_rate": 4.37894438811931e-08, + "loss": 0.0064, + "reward": 3.5625, + "reward_std": 2.875, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 1.0, + "step": 349 + }, + { + "epoch": 87.44444444444444, + "grad_norm": 1.4586421894209247, + "learning_rate": 4.2113336672471245e-08, + "loss": 0.0579, + "step": 350 + }, + { + "epoch": 87.44444444444444, + "eval_clip_ratio": 0.0, + "eval_completion_length": 609.05, + "eval_kl": 0.052783203125, + "eval_loss": 0.033656854182481766, + "eval_reward": 2.45, + "eval_reward_std": 1.6229771614074706, + "eval_rewards/accuracy_reward_staging": 0.15, + "eval_rewards/format_reward": 0.825, + "eval_rewards/format_reward_staging": 0.875, + "eval_runtime": 55.2193, + "eval_samples_per_second": 0.652, + "eval_steps_per_second": 0.091, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completion_length": 609.578125, + "epoch": 87.66666666666667, + "grad_norm": 1.6263684105864271, + "kl": 0.0521240234375, + "learning_rate": 4.0468525408954456e-08, + "loss": 0.0832, + "reward": 4.265625, + "reward_std": 2.632143199443817, + "rewards/accuracy_reward_staging": 0.484375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.9375, + "step": 351 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.0, + "epoch": 87.88888888888889, + "grad_norm": 1.406310532139818, + "kl": 0.0509033203125, + "learning_rate": 3.8855122511307626e-08, + "loss": 0.0517, + "reward": 3.0625, + "reward_std": 1.3608438968658447, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completion_length": 592.46875, + "epoch": 88.22222222222223, + "grad_norm": 1.505022624138408, + "kl": 0.05255126953125, + "learning_rate": 3.727323825347578e-08, + "loss": 0.0469, + "reward": 4.25, + "reward_std": 2.023455113172531, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 353 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.21875, + "epoch": 88.44444444444444, + "grad_norm": 1.7398525080011868, + "kl": 0.051025390625, + "learning_rate": 3.572298075514652e-08, + "loss": 0.0079, + "reward": 5.25, + "reward_std": 2.496154248714447, + "rewards/accuracy_reward_staging": 0.65625, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 0.96875, + "step": 354 + }, + { + "clip_ratio": 0.0, + "completion_length": 609.75, + "epoch": 88.66666666666667, + "grad_norm": 1.4985931737207225, + "kl": 0.05255126953125, + "learning_rate": 3.420445597436056e-08, + "loss": 0.0262, + "reward": 4.09375, + "reward_std": 2.1352776885032654, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completion_length": 673.75, + "epoch": 88.88888888888889, + "grad_norm": 1.4906376909879282, + "kl": 0.05889892578125, + "learning_rate": 3.271776770026963e-08, + "loss": 0.0716, + "reward": 3.28125, + "reward_std": 2.086387515068054, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.9375, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.53125, + "epoch": 89.22222222222223, + "grad_norm": 1.87310658244872, + "kl": 0.08197021484375, + "learning_rate": 3.1263017546042326e-08, + "loss": 0.0395, + "reward": 3.90625, + "reward_std": 2.444858193397522, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 1.0, + "step": 357 + }, + { + "clip_ratio": 0.0, + "completion_length": 570.34375, + "epoch": 89.44444444444444, + "grad_norm": 1.6047992153607105, + "kl": 0.05328369140625, + "learning_rate": 2.9840304941919416e-08, + "loss": 0.0128, + "reward": 4.09375, + "reward_std": 3.0098507404327393, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 358 + }, + { + "clip_ratio": 0.0, + "completion_length": 588.09375, + "epoch": 89.66666666666667, + "grad_norm": 1.3977481210272724, + "kl": 0.0628662109375, + "learning_rate": 2.8449727128417367e-08, + "loss": 0.0184, + "reward": 3.6875, + "reward_std": 1.197430670261383, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.875, + "step": 359 + }, + { + "clip_ratio": 0.0, + "completion_length": 663.0625, + "epoch": 89.88888888888889, + "grad_norm": 1.521856541607207, + "kl": 0.04925537109375, + "learning_rate": 2.7091379149682682e-08, + "loss": -0.0485, + "reward": 4.34375, + "reward_std": 2.930923640727997, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.0625, + "epoch": 90.22222222222223, + "grad_norm": 1.8447806012116532, + "kl": 0.05377197265625, + "learning_rate": 2.5765353846995297e-08, + "loss": 0.049, + "reward": 4.53125, + "reward_std": 3.1164740920066833, + "rewards/accuracy_reward_staging": 0.5625, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.90625, + "step": 361 + }, + { + "clip_ratio": 0.0, + "completion_length": 650.21875, + "epoch": 90.44444444444444, + "grad_norm": 1.184344841195356, + "kl": 0.04827880859375, + "learning_rate": 2.4471741852423233e-08, + "loss": 0.0432, + "reward": 2.96875, + "reward_std": 1.5483438968658447, + "rewards/accuracy_reward_staging": 0.21875, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.9375, + "step": 362 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.875, + "epoch": 90.66666666666667, + "grad_norm": 1.4816241418571312, + "kl": 0.065185546875, + "learning_rate": 2.3210631582627927e-08, + "loss": -0.007, + "reward": 4.53125, + "reward_std": 2.563981920480728, + "rewards/accuracy_reward_staging": 0.53125, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 363 + }, + { + "clip_ratio": 0.0, + "completion_length": 646.875, + "epoch": 90.88888888888889, + "grad_norm": 1.4324587884354845, + "kl": 0.0557861328125, + "learning_rate": 2.1982109232821176e-08, + "loss": 0.0456, + "reward": 4.6875, + "reward_std": 2.595020294189453, + "rewards/accuracy_reward_staging": 0.5625, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completion_length": 609.875, + "epoch": 91.22222222222223, + "grad_norm": 1.406708270283001, + "kl": 0.04571533203125, + "learning_rate": 2.0786258770873645e-08, + "loss": -0.0323, + "reward": 5.28125, + "reward_std": 2.358702301979065, + "rewards/accuracy_reward_staging": 0.65625, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completion_length": 625.96875, + "epoch": 91.44444444444444, + "grad_norm": 1.254136427080868, + "kl": 0.0477294921875, + "learning_rate": 1.9623161931575926e-08, + "loss": 0.0391, + "reward": 4.25, + "reward_std": 1.3912444412708282, + "rewards/accuracy_reward_staging": 0.46875, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 366 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.8125, + "epoch": 91.66666666666667, + "grad_norm": 1.4411232733747643, + "kl": 0.057861328125, + "learning_rate": 1.849289821105199e-08, + "loss": 0.0171, + "reward": 3.125, + "reward_std": 1.5756275057792664, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.9375, + "step": 367 + }, + { + "clip_ratio": 0.0, + "completion_length": 629.1875, + "epoch": 91.88888888888889, + "grad_norm": 1.1371389339839404, + "kl": 0.051513671875, + "learning_rate": 1.7395544861325718e-08, + "loss": 0.011, + "reward": 3.53125, + "reward_std": 1.816932737827301, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completion_length": 615.21875, + "epoch": 92.22222222222223, + "grad_norm": 1.2832104145352503, + "kl": 0.046142578125, + "learning_rate": 1.6331176885040876e-08, + "loss": 0.0567, + "reward": 3.78125, + "reward_std": 1.9511407613754272, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 369 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.375, + "epoch": 92.44444444444444, + "grad_norm": 1.4397679570391773, + "kl": 0.05340576171875, + "learning_rate": 1.5299867030334813e-08, + "loss": 0.0089, + "reward": 3.1875, + "reward_std": 1.2975594997406006, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 1.0, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.0625, + "epoch": 92.66666666666667, + "grad_norm": 1.5669540097130739, + "kl": 0.066650390625, + "learning_rate": 1.4301685785866213e-08, + "loss": -0.0198, + "reward": 4.46875, + "reward_std": 2.9167675375938416, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 371 + }, + { + "clip_ratio": 0.0, + "completion_length": 588.1875, + "epoch": 92.88888888888889, + "grad_norm": 1.6359800713030668, + "kl": 0.05194091796875, + "learning_rate": 1.3336701375997127e-08, + "loss": 0.0226, + "reward": 4.1875, + "reward_std": 2.957531690597534, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completion_length": 596.375, + "epoch": 93.22222222222223, + "grad_norm": 22.007037588121534, + "kl": 0.2752685546875, + "learning_rate": 1.240497975613014e-08, + "loss": -0.0325, + "reward": 3.75, + "reward_std": 1.8229495882987976, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.8125, + "rewards/format_reward_staging": 0.90625, + "step": 373 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.75, + "epoch": 93.44444444444444, + "grad_norm": 1.5726784994170007, + "kl": 0.05316162109375, + "learning_rate": 1.1506584608200364e-08, + "loss": 0.0904, + "reward": 2.75, + "reward_std": 1.680722177028656, + "rewards/accuracy_reward_staging": 0.1875, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.90625, + "step": 374 + }, + { + "clip_ratio": 0.0, + "completion_length": 597.53125, + "epoch": 93.66666666666667, + "grad_norm": 1.2719220301062524, + "kl": 0.05828857421875, + "learning_rate": 1.0641577336322761e-08, + "loss": 0.0199, + "reward": 4.96875, + "reward_std": 2.2166852056980133, + "rewards/accuracy_reward_staging": 0.625, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.96875, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completion_length": 640.1875, + "epoch": 93.88888888888889, + "grad_norm": 2.653449397011027, + "kl": 0.07366943359375, + "learning_rate": 9.810017062595321e-09, + "loss": 0.0336, + "reward": 4.0, + "reward_std": 2.329674154520035, + "rewards/accuracy_reward_staging": 0.4375, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.21875, + "epoch": 94.22222222222223, + "grad_norm": 1.4821827826071337, + "kl": 0.04779052734375, + "learning_rate": 9.011960623058201e-09, + "loss": -0.0241, + "reward": 4.53125, + "reward_std": 1.9632892608642578, + "rewards/accuracy_reward_staging": 0.53125, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 377 + }, + { + "clip_ratio": 0.0, + "completion_length": 516.34375, + "epoch": 94.44444444444444, + "grad_norm": 1.424328758939937, + "kl": 0.055419921875, + "learning_rate": 8.247462563808816e-09, + "loss": 0.018, + "reward": 4.46875, + "reward_std": 2.4695461988449097, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 378 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.75, + "epoch": 94.66666666666667, + "grad_norm": 1.3627094949939382, + "kl": 0.05291748046875, + "learning_rate": 7.516575137274162e-09, + "loss": 0.05, + "reward": 3.9375, + "reward_std": 2.0698782801628113, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 379 + }, + { + "clip_ratio": 0.0, + "completion_length": 632.625, + "epoch": 94.88888888888889, + "grad_norm": 1.1699961416103346, + "kl": 0.05120849609375, + "learning_rate": 6.819348298638839e-09, + "loss": 0.0182, + "reward": 3.1875, + "reward_std": 2.152123808860779, + "rewards/accuracy_reward_staging": 0.25, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 618.8125, + "epoch": 95.22222222222223, + "grad_norm": 1.362463533881549, + "kl": 0.06512451171875, + "learning_rate": 6.15582970243117e-09, + "loss": 0.0677, + "reward": 3.375, + "reward_std": 1.7910222113132477, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 381 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.15625, + "epoch": 95.44444444444444, + "grad_norm": 1.297999563247604, + "kl": 0.06048583984375, + "learning_rate": 5.526064699265753e-09, + "loss": 0.0032, + "reward": 3.96875, + "reward_std": 1.8312554359436035, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 382 + }, + { + "clip_ratio": 0.0, + "completion_length": 604.625, + "epoch": 95.66666666666667, + "grad_norm": 1.486555198053525, + "kl": 0.05755615234375, + "learning_rate": 4.9300963327441044e-09, + "loss": 0.043, + "reward": 4.40625, + "reward_std": 3.2432121634483337, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 383 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.875, + "epoch": 95.88888888888889, + "grad_norm": 1.5255994079961044, + "kl": 0.05120849609375, + "learning_rate": 4.367965336512403e-09, + "loss": -0.0079, + "reward": 3.8125, + "reward_std": 1.9035333096981049, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 1.0, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completion_length": 500.53125, + "epoch": 96.22222222222223, + "grad_norm": 1.3113670847804533, + "kl": 0.05438232421875, + "learning_rate": 3.8397101314774915e-09, + "loss": -0.0184, + "reward": 3.28125, + "reward_std": 1.0818375647068024, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.9375, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completion_length": 554.8125, + "epoch": 96.44444444444444, + "grad_norm": 1.7546175198232294, + "kl": 0.056640625, + "learning_rate": 3.3453668231809283e-09, + "loss": -0.0321, + "reward": 5.5, + "reward_std": 3.5806562304496765, + "rewards/accuracy_reward_staging": 0.71875, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.96875, + "step": 386 + }, + { + "clip_ratio": 0.0, + "completion_length": 632.9375, + "epoch": 96.66666666666667, + "grad_norm": 1.3830980489663667, + "kl": 0.050537109375, + "learning_rate": 2.8849691993311777e-09, + "loss": 0.0483, + "reward": 3.59375, + "reward_std": 2.2675071954727173, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.96875, + "step": 387 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.0, + "epoch": 96.88888888888889, + "grad_norm": 1.6187360157512092, + "kl": 0.0645751953125, + "learning_rate": 2.458548727494292e-09, + "loss": 0.0672, + "reward": 4.1875, + "reward_std": 2.5254639387130737, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.78125, + "rewards/format_reward_staging": 0.90625, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.3125, + "epoch": 97.22222222222223, + "grad_norm": 1.513261818035226, + "kl": 0.05316162109375, + "learning_rate": 2.066134552943077e-09, + "loss": -0.054, + "reward": 4.0, + "reward_std": 2.443375587463379, + "rewards/accuracy_reward_staging": 0.40625, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 1.0, + "step": 389 + }, + { + "clip_ratio": 0.0, + "completion_length": 618.53125, + "epoch": 97.44444444444444, + "grad_norm": 1.2736528121828654, + "kl": 0.04791259765625, + "learning_rate": 1.7077534966650765e-09, + "loss": 0.0219, + "reward": 5.375, + "reward_std": 2.514360010623932, + "rewards/accuracy_reward_staging": 0.6875, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 1.0, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completion_length": 593.1875, + "epoch": 97.66666666666667, + "grad_norm": 1.5869628592439236, + "kl": 0.0782470703125, + "learning_rate": 1.383430053529422e-09, + "loss": -0.01, + "reward": 3.40625, + "reward_std": 1.3685379922389984, + "rewards/accuracy_reward_staging": 0.3125, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 0.9375, + "step": 391 + }, + { + "clip_ratio": 0.0, + "completion_length": 646.5, + "epoch": 97.88888888888889, + "grad_norm": 1.3206401873227125, + "kl": 0.052490234375, + "learning_rate": 1.0931863906127325e-09, + "loss": -0.0253, + "reward": 3.65625, + "reward_std": 1.6694981455802917, + "rewards/accuracy_reward_staging": 0.34375, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.78125, + "epoch": 98.22222222222223, + "grad_norm": 1.8394528163274233, + "kl": 0.0552978515625, + "learning_rate": 8.370423456837139e-10, + "loss": 0.0136, + "reward": 4.625, + "reward_std": 2.260310411453247, + "rewards/accuracy_reward_staging": 0.5625, + "rewards/format_reward": 0.84375, + "rewards/format_reward_staging": 0.96875, + "step": 393 + }, + { + "clip_ratio": 0.0, + "completion_length": 621.53125, + "epoch": 98.44444444444444, + "grad_norm": 1.5558051348782826, + "kl": 0.06475830078125, + "learning_rate": 6.150154258476314e-10, + "loss": -0.0687, + "reward": 4.71875, + "reward_std": 2.5687596797943115, + "rewards/accuracy_reward_staging": 0.5625, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.9375, + "step": 394 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.75, + "epoch": 98.66666666666667, + "grad_norm": 1.1453949481051802, + "kl": 0.048095703125, + "learning_rate": 4.271208063494902e-10, + "loss": -0.0004, + "reward": 2.5625, + "reward_std": 1.1108438968658447, + "rewards/accuracy_reward_staging": 0.125, + "rewards/format_reward": 0.96875, + "rewards/format_reward_staging": 0.96875, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completion_length": 612.9375, + "epoch": 98.88888888888889, + "grad_norm": 1.627509813072313, + "kl": 0.05230712890625, + "learning_rate": 2.733713295369755e-10, + "loss": -0.0208, + "reward": 4.375, + "reward_std": 2.401917338371277, + "rewards/accuracy_reward_staging": 0.5, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.9375, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completion_length": 598.03125, + "epoch": 99.22222222222223, + "grad_norm": 1.3542702485509543, + "kl": 0.0565185546875, + "learning_rate": 1.53777503982655e-10, + "loss": 0.0102, + "reward": 3.3125, + "reward_std": 2.002065122127533, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.90625, + "rewards/format_reward_staging": 1.0, + "step": 397 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.84375, + "epoch": 99.44444444444444, + "grad_norm": 1.524546929028841, + "kl": 0.06207275390625, + "learning_rate": 6.834750376549791e-11, + "loss": 0.0366, + "reward": 4.625, + "reward_std": 2.895161896944046, + "rewards/accuracy_reward_staging": 0.5625, + "rewards/format_reward": 0.875, + "rewards/format_reward_staging": 0.9375, + "step": 398 + }, + { + "clip_ratio": 0.0, + "completion_length": 563.6875, + "epoch": 99.66666666666667, + "grad_norm": 1.4232250672089828, + "kl": 0.05712890625, + "learning_rate": 1.7087167912710476e-11, + "loss": 0.0203, + "reward": 3.875, + "reward_std": 1.875, + "rewards/accuracy_reward_staging": 0.375, + "rewards/format_reward": 1.0, + "rewards/format_reward_staging": 1.0, + "step": 399 + }, + { + "epoch": 99.88888888888889, + "grad_norm": 1.8247807880781484, + "learning_rate": 0.0, + "loss": 0.0974, + "step": 400 + }, + { + "epoch": 99.88888888888889, + "eval_clip_ratio": 0.0, + "eval_completion_length": 644.175, + "eval_kl": 0.052978515625, + "eval_loss": 0.008646870031952858, + "eval_reward": 2.3, + "eval_reward_std": 1.2995877504348754, + "eval_rewards/accuracy_reward_staging": 0.125, + "eval_rewards/format_reward": 0.8, + "eval_rewards/format_reward_staging": 0.875, + "eval_runtime": 54.9436, + "eval_samples_per_second": 0.655, + "eval_steps_per_second": 0.091, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 498.375, + "epoch": 99.88888888888889, + "kl": 0.0625, + "reward": 3.28125, + "reward_std": 2.418270230293274, + "rewards/accuracy_reward_staging": 0.28125, + "rewards/format_reward": 0.9375, + "rewards/format_reward_staging": 0.9375, + "step": 400, + "total_flos": 0.0, + "train_loss": 0.01939338302021497, + "train_runtime": 14839.7642, + "train_samples_per_second": 0.243, + "train_steps_per_second": 0.027 + } + ], + "logging_steps": 1, + "max_steps": 400, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}