{ "best_metric": null, "best_model_checkpoint": null, "epoch": 99.88888888888889, "eval_steps": 50, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 535.125, "epoch": 0.2222222222222222, "grad_norm": 1.7916724271880604, "kl": 0.0, "learning_rate": 5e-08, "loss": 0.0583, "reward": 2.3125, "reward_std": 1.1971687823534012, "rewards/accuracy_reward_staging": 0.09375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 555.90625, "epoch": 0.4444444444444444, "grad_norm": 1.5555075403521712, "kl": 0.0, "learning_rate": 1e-07, "loss": -0.0705, "reward": 2.5625, "reward_std": 1.2858919501304626, "rewards/accuracy_reward_staging": 0.15625, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.9375, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 541.46875, "epoch": 0.6666666666666666, "grad_norm": 1.6594522931688669, "kl": 0.0010576248168945312, "learning_rate": 1.5e-07, "loss": -0.0235, "reward": 2.59375, "reward_std": 1.6232599020004272, "rewards/accuracy_reward_staging": 0.15625, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.90625, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 613.25, "epoch": 0.8888888888888888, "grad_norm": 2.3276142189283164, "kl": 0.0011081695556640625, "learning_rate": 2e-07, "loss": 0.1029, "reward": 2.875, "reward_std": 1.8071783781051636, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.96875, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 649.84375, "epoch": 1.2222222222222223, "grad_norm": 1.5167959821278052, "kl": 0.0010709762573242188, "learning_rate": 2.5e-07, "loss": 0.0003, "reward": 2.84375, "reward_std": 1.7606024742126465, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.96875, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 574.25, "epoch": 1.4444444444444444, "grad_norm": 1.491122536644779, "kl": 0.0009145736694335938, "learning_rate": 3e-07, "loss": 0.0377, "reward": 2.75, "reward_std": 1.8017165958881378, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.90625, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 550.46875, "epoch": 1.6666666666666665, "grad_norm": 1.5321454699600687, "kl": 0.0016422271728515625, "learning_rate": 3.5e-07, "loss": 0.0173, "reward": 2.8125, "reward_std": 1.498587191104889, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 530.0625, "epoch": 1.8888888888888888, "grad_norm": 1.7429693147530465, "kl": 0.0010614395141601562, "learning_rate": 4e-07, "loss": 0.0413, "reward": 3.15625, "reward_std": 2.1272581219673157, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.96875, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 643.875, "epoch": 2.2222222222222223, "grad_norm": 1.53726074310182, "kl": 0.0013751983642578125, "learning_rate": 4.5e-07, "loss": -0.005, "reward": 3.125, "reward_std": 2.054091453552246, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 614.09375, "epoch": 2.4444444444444446, "grad_norm": 1.3654100960829842, "kl": 0.0012149810791015625, "learning_rate": 5e-07, "loss": -0.0164, "reward": 2.59375, "reward_std": 1.0483438968658447, "rewards/accuracy_reward_staging": 0.125, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 555.3125, "epoch": 2.6666666666666665, "grad_norm": 1.4260001116361793, "kl": 0.0010051727294921875, "learning_rate": 5.5e-07, "loss": 0.0251, "reward": 3.0625, "reward_std": 1.7733518332242966, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.96875, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 552.5625, "epoch": 2.888888888888889, "grad_norm": 1.5253120629648043, "kl": 0.001361846923828125, "learning_rate": 6e-07, "loss": 0.0285, "reward": 3.3125, "reward_std": 1.9136751294136047, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 1.0, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 579.71875, "epoch": 3.2222222222222223, "grad_norm": 1.5612924435198745, "kl": 0.0019207000732421875, "learning_rate": 6.5e-07, "loss": 0.0829, "reward": 2.0625, "reward_std": 0.5475594997406006, "rewards/accuracy_reward_staging": 0.0625, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.9375, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 648.125, "epoch": 3.4444444444444446, "grad_norm": 1.472369166378751, "kl": 0.0019435882568359375, "learning_rate": 7e-07, "loss": 0.0889, "reward": 2.3125, "reward_std": 1.3669461011886597, "rewards/accuracy_reward_staging": 0.125, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.84375, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 660.78125, "epoch": 3.6666666666666665, "grad_norm": 1.2833764786982476, "kl": 0.00171661376953125, "learning_rate": 7.5e-07, "loss": -0.0032, "reward": 2.28125, "reward_std": 0.9946783781051636, "rewards/accuracy_reward_staging": 0.09375, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.96875, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 555.625, "epoch": 3.888888888888889, "grad_norm": 1.7981216304584955, "kl": 0.003185272216796875, "learning_rate": 8e-07, "loss": 0.0022, "reward": 4.09375, "reward_std": 2.7086294293403625, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 581.375, "epoch": 4.222222222222222, "grad_norm": 1.8924801483136653, "kl": 0.003849029541015625, "learning_rate": 8.499999999999999e-07, "loss": 0.0192, "reward": 2.8125, "reward_std": 1.4357599020004272, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 626.875, "epoch": 4.444444444444445, "grad_norm": 1.4237753323985947, "kl": 0.004940032958984375, "learning_rate": 9e-07, "loss": 0.0048, "reward": 2.78125, "reward_std": 1.6875, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 583.875, "epoch": 4.666666666666667, "grad_norm": 1.4401282377616447, "kl": 0.00505828857421875, "learning_rate": 9.499999999999999e-07, "loss": 0.0016, "reward": 3.4375, "reward_std": 2.3147872537374496, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.90625, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 562.8125, "epoch": 4.888888888888889, "grad_norm": 1.1629869227175655, "kl": 0.00585174560546875, "learning_rate": 1e-06, "loss": 0.0, "reward": 2.5625, "reward_std": 0.9797460436820984, "rewards/accuracy_reward_staging": 0.125, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 1.0, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 614.875, "epoch": 5.222222222222222, "grad_norm": 1.6115188653051613, "kl": 0.00612640380859375, "learning_rate": 9.999829128320873e-07, "loss": 0.0565, "reward": 3.28125, "reward_std": 2.4976893961429596, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 559.46875, "epoch": 5.444444444444445, "grad_norm": 1.465512353981508, "kl": 0.00824737548828125, "learning_rate": 9.999316524962345e-07, "loss": 0.0541, "reward": 3.3125, "reward_std": 1.8101893961429596, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 572.59375, "epoch": 5.666666666666667, "grad_norm": 1.5847579776558225, "kl": 0.0093841552734375, "learning_rate": 9.998462224960173e-07, "loss": 0.06, "reward": 3.6875, "reward_std": 2.443375587463379, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 645.9375, "epoch": 5.888888888888889, "grad_norm": 1.8362203993654154, "kl": 0.00734710693359375, "learning_rate": 9.99726628670463e-07, "loss": 0.0368, "reward": 3.03125, "reward_std": 2.283504918217659, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.71875, "rewards/format_reward_staging": 0.90625, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 572.5, "epoch": 6.222222222222222, "grad_norm": 1.6415108932304052, "kl": 0.0096588134765625, "learning_rate": 9.995728791936505e-07, "loss": 0.0267, "reward": 2.96875, "reward_std": 1.7760016024112701, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 579.96875, "epoch": 6.444444444444445, "grad_norm": 1.4689069714869325, "kl": 0.010345458984375, "learning_rate": 9.993849845741523e-07, "loss": 0.1034, "reward": 2.5625, "reward_std": 1.1108438968658447, "rewards/accuracy_reward_staging": 0.15625, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.90625, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 542.46875, "epoch": 6.666666666666667, "grad_norm": 1.7253968854719324, "kl": 0.01122283935546875, "learning_rate": 9.991629576543163e-07, "loss": -0.0129, "reward": 2.625, "reward_std": 1.316565990447998, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.90625, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 598.0, "epoch": 6.888888888888889, "grad_norm": 1.439672104037944, "kl": 0.0132293701171875, "learning_rate": 9.989068136093872e-07, "loss": 0.0324, "reward": 3.375, "reward_std": 2.423195868730545, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 557.5625, "epoch": 7.222222222222222, "grad_norm": 1.53093980357088, "kl": 0.0146942138671875, "learning_rate": 9.986165699464705e-07, "loss": -0.0074, "reward": 3.125, "reward_std": 2.0308370888233185, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 574.90625, "epoch": 7.444444444444445, "grad_norm": 1.0715134693817079, "kl": 0.0147857666015625, "learning_rate": 9.982922465033348e-07, "loss": -0.0166, "reward": 2.5, "reward_std": 0.9858438968658447, "rewards/accuracy_reward_staging": 0.125, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 1.0, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 602.65625, "epoch": 7.666666666666667, "grad_norm": 1.4389686833903352, "kl": 0.01611328125, "learning_rate": 9.979338654470567e-07, "loss": 0.0875, "reward": 2.4375, "reward_std": 1.2930222898721695, "rewards/accuracy_reward_staging": 0.125, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 669.5625, "epoch": 7.888888888888889, "grad_norm": 1.0489321524468773, "kl": 0.01910400390625, "learning_rate": 9.975414512725056e-07, "loss": 0.0185, "reward": 2.5625, "reward_std": 1.037847101688385, "rewards/accuracy_reward_staging": 0.125, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 532.625, "epoch": 8.222222222222221, "grad_norm": 1.5157291140048736, "kl": 0.01885986328125, "learning_rate": 9.971150308006687e-07, "loss": -0.0001, "reward": 4.125, "reward_std": 2.000675529241562, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 1.0, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 591.03125, "epoch": 8.444444444444445, "grad_norm": 1.5963578785319679, "kl": 0.0192413330078125, "learning_rate": 9.966546331768192e-07, "loss": 0.1269, "reward": 2.875, "reward_std": 2.112294152379036, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.84375, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 603.09375, "epoch": 8.666666666666666, "grad_norm": 1.4508455813252856, "kl": 0.01494598388671875, "learning_rate": 9.961602898685223e-07, "loss": 0.0585, "reward": 3.3125, "reward_std": 2.0126227736473083, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.9375, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 579.9375, "epoch": 8.88888888888889, "grad_norm": 1.196537394176258, "kl": 0.0169830322265625, "learning_rate": 9.956320346634875e-07, "loss": 0.0166, "reward": 2.78125, "reward_std": 1.3710740953683853, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 1.0, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 572.96875, "epoch": 9.222222222222221, "grad_norm": 1.4031846103728705, "kl": 0.0164794921875, "learning_rate": 9.95069903667256e-07, "loss": 0.0257, "reward": 2.65625, "reward_std": 1.4369846880435944, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.90625, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 533.8125, "epoch": 9.444444444444445, "grad_norm": 1.7378697171564481, "kl": 0.019744873046875, "learning_rate": 9.944739353007341e-07, "loss": 0.0651, "reward": 3.6875, "reward_std": 2.841255784034729, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.96875, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 575.5625, "epoch": 9.666666666666666, "grad_norm": 1.6742496883549038, "kl": 0.018218994140625, "learning_rate": 9.938441702975689e-07, "loss": 0.0249, "reward": 2.4375, "reward_std": 1.2126952707767487, "rewards/accuracy_reward_staging": 0.125, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.96875, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 541.375, "epoch": 9.88888888888889, "grad_norm": 1.6853780037379804, "kl": 0.0196533203125, "learning_rate": 9.931806517013612e-07, "loss": 0.0121, "reward": 2.5625, "reward_std": 1.3815238624811172, "rewards/accuracy_reward_staging": 0.15625, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.9375, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 550.90625, "epoch": 10.222222222222221, "grad_norm": 1.2047759950332129, "kl": 0.017730712890625, "learning_rate": 9.924834248627258e-07, "loss": 0.0398, "reward": 2.8125, "reward_std": 1.4487498700618744, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 587.34375, "epoch": 10.444444444444445, "grad_norm": 2.2662890327219642, "kl": 0.032135009765625, "learning_rate": 9.917525374361911e-07, "loss": 0.0402, "reward": 3.375, "reward_std": 2.6460810601711273, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.96875, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 552.4375, "epoch": 10.666666666666666, "grad_norm": 0.8485843884389722, "kl": 0.021148681640625, "learning_rate": 9.909880393769418e-07, "loss": 0.0349, "reward": 2.5, "reward_std": 1.045437604188919, "rewards/accuracy_reward_staging": 0.125, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9375, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 626.625, "epoch": 10.88888888888889, "grad_norm": 1.4242611362455049, "kl": 0.018280029296875, "learning_rate": 9.901899829374047e-07, "loss": 0.0405, "reward": 3.03125, "reward_std": 2.107846677303314, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.90625, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 536.8125, "epoch": 11.222222222222221, "grad_norm": 1.6000184113652984, "kl": 0.025238037109375, "learning_rate": 9.893584226636772e-07, "loss": -0.0471, "reward": 2.78125, "reward_std": 1.6772827804088593, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 1.0, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 588.65625, "epoch": 11.444444444444445, "grad_norm": 1.2633801476740014, "kl": 0.02093505859375, "learning_rate": 9.884934153917996e-07, "loss": 0.027, "reward": 2.4375, "reward_std": 1.226884126663208, "rewards/accuracy_reward_staging": 0.125, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 524.1875, "epoch": 11.666666666666666, "grad_norm": 1.7228504370636915, "kl": 0.020599365234375, "learning_rate": 9.8759502024387e-07, "loss": -0.0016, "reward": 3.125, "reward_std": 1.8041669130325317, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.90625, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 572.875, "epoch": 11.88888888888889, "grad_norm": 7.859881611636793, "kl": 0.063079833984375, "learning_rate": 9.866632986240029e-07, "loss": 0.0482, "reward": 3.25, "reward_std": 2.0755133628845215, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 577.5625, "epoch": 12.222222222222221, "grad_norm": 1.7851397304147796, "kl": 0.0205078125, "learning_rate": 9.856983142141337e-07, "loss": 0.0509, "reward": 3.3125, "reward_std": 2.14286145567894, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.9375, "step": 49 }, { "epoch": 12.444444444444445, "grad_norm": 1.585137209096838, "learning_rate": 9.847001329696652e-07, "loss": -0.0125, "step": 50 }, { "epoch": 12.444444444444445, "eval_clip_ratio": 0.0, "eval_completion_length": 597.925, "eval_kl": 0.02578125, "eval_loss": 0.024221811443567276, "eval_reward": 2.625, "eval_reward_std": 1.6041045665740967, "eval_rewards/accuracy_reward_staging": 0.175, "eval_rewards/format_reward": 0.8, "eval_rewards/format_reward_staging": 0.95, "eval_runtime": 51.776, "eval_samples_per_second": 0.695, "eval_steps_per_second": 0.097, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 567.046875, "epoch": 12.666666666666666, "grad_norm": 1.6497404550782266, "kl": 0.020294189453125, "learning_rate": 9.836688231149592e-07, "loss": -0.0235, "reward": 3.328125, "reward_std": 2.148952841758728, "rewards/accuracy_reward_staging": 0.296875, "rewards/format_reward": 0.890625, "rewards/format_reward_staging": 0.953125, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 655.3125, "epoch": 12.88888888888889, "grad_norm": 1.0110588489868237, "kl": 0.018829345703125, "learning_rate": 9.826044551386742e-07, "loss": -0.0207, "reward": 2.5625, "reward_std": 1.046603798866272, "rewards/accuracy_reward_staging": 0.125, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 571.65625, "epoch": 13.222222222222221, "grad_norm": 1.5942717910970237, "kl": 0.0233154296875, "learning_rate": 9.81507101788948e-07, "loss": 0.0327, "reward": 2.96875, "reward_std": 2.0815286338329315, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 1.0, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 531.0, "epoch": 13.444444444444445, "grad_norm": 1.6431487531106521, "kl": 0.02325439453125, "learning_rate": 9.803768380684242e-07, "loss": -0.005, "reward": 3.1875, "reward_std": 2.4305797815322876, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.90625, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 530.71875, "epoch": 13.666666666666666, "grad_norm": 1.3532727186337274, "kl": 0.021026611328125, "learning_rate": 9.792137412291263e-07, "loss": -0.0091, "reward": 3.09375, "reward_std": 1.5625, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 573.375, "epoch": 13.88888888888889, "grad_norm": 1.4441812945667367, "kl": 0.024932861328125, "learning_rate": 9.780178907671788e-07, "loss": 0.0275, "reward": 3.34375, "reward_std": 2.1209341287612915, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.9375, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 520.15625, "epoch": 14.222222222222221, "grad_norm": 1.6824005469371979, "kl": 0.026092529296875, "learning_rate": 9.76789368417372e-07, "loss": -0.0531, "reward": 2.8125, "reward_std": 1.377088338136673, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 1.0, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 599.96875, "epoch": 14.444444444444445, "grad_norm": 1.4915574785365073, "kl": 0.021026611328125, "learning_rate": 9.755282581475767e-07, "loss": 0.0364, "reward": 4.9375, "reward_std": 2.745547831058502, "rewards/accuracy_reward_staging": 0.59375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 549.71875, "epoch": 14.666666666666666, "grad_norm": 1.4821961551515155, "kl": 0.02593994140625, "learning_rate": 9.742346461530047e-07, "loss": 0.0872, "reward": 2.53125, "reward_std": 1.4375, "rewards/accuracy_reward_staging": 0.125, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 601.375, "epoch": 14.88888888888889, "grad_norm": 1.249530356824017, "kl": 0.023406982421875, "learning_rate": 9.729086208503173e-07, "loss": 0.0652, "reward": 2.4375, "reward_std": 1.1680222749710083, "rewards/accuracy_reward_staging": 0.125, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 613.71875, "epoch": 15.222222222222221, "grad_norm": 1.4621397072761817, "kl": 0.0252685546875, "learning_rate": 9.715502728715825e-07, "loss": 0.0108, "reward": 2.96875, "reward_std": 1.8319481909275055, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 621.71875, "epoch": 15.444444444444445, "grad_norm": 1.4960047973343167, "kl": 0.023590087890625, "learning_rate": 9.701596950580807e-07, "loss": -0.008, "reward": 3.21875, "reward_std": 2.3255662322044373, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 559.0, "epoch": 15.666666666666666, "grad_norm": 1.377229747116843, "kl": 0.031097412109375, "learning_rate": 9.687369824539576e-07, "loss": 0.072, "reward": 2.9375, "reward_std": 1.7239685356616974, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 542.125, "epoch": 15.88888888888889, "grad_norm": 1.3837348765591453, "kl": 0.034423828125, "learning_rate": 9.672822322997304e-07, "loss": 0.0508, "reward": 2.28125, "reward_std": 1.1752630770206451, "rewards/accuracy_reward_staging": 0.09375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.90625, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 645.625, "epoch": 16.22222222222222, "grad_norm": 1.2294440183285422, "kl": 0.023651123046875, "learning_rate": 9.657955440256395e-07, "loss": -0.0012, "reward": 2.59375, "reward_std": 1.0483438968658447, "rewards/accuracy_reward_staging": 0.125, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 593.46875, "epoch": 16.444444444444443, "grad_norm": 1.5888259552277046, "kl": 0.02777099609375, "learning_rate": 9.642770192448535e-07, "loss": 0.0496, "reward": 3.71875, "reward_std": 2.2672154307365417, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.71875, "rewards/format_reward_staging": 0.96875, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 574.875, "epoch": 16.666666666666668, "grad_norm": 1.5785381612535059, "kl": 0.034942626953125, "learning_rate": 9.627267617465243e-07, "loss": -0.0426, "reward": 3.03125, "reward_std": 1.496883064508438, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 1.0, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 566.4375, "epoch": 16.88888888888889, "grad_norm": 1.5972037559178247, "kl": 0.026702880859375, "learning_rate": 9.611448774886923e-07, "loss": 0.005, "reward": 3.15625, "reward_std": 1.9091877937316895, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 624.21875, "epoch": 17.22222222222222, "grad_norm": 2.781118760006495, "kl": 0.042724609375, "learning_rate": 9.595314745910455e-07, "loss": 0.0926, "reward": 3.3125, "reward_std": 2.4584514498710632, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.9375, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 545.65625, "epoch": 17.444444444444443, "grad_norm": 1.6678524207695304, "kl": 0.028411865234375, "learning_rate": 9.578866633275286e-07, "loss": 0.0606, "reward": 3.75, "reward_std": 2.237764596939087, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9375, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 524.0, "epoch": 17.666666666666668, "grad_norm": 1.4163449995088322, "kl": 0.032470703125, "learning_rate": 9.562105561188068e-07, "loss": 0.0105, "reward": 3.40625, "reward_std": 1.9233438968658447, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 619.8125, "epoch": 17.88888888888889, "grad_norm": 1.330864720677356, "kl": 0.02783203125, "learning_rate": 9.545032675245813e-07, "loss": 0.0232, "reward": 2.875, "reward_std": 1.5208123177289963, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.90625, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 615.5625, "epoch": 18.22222222222222, "grad_norm": 1.3940735985441313, "kl": 0.0289306640625, "learning_rate": 9.527649142357594e-07, "loss": 0.0449, "reward": 4.8125, "reward_std": 3.365248918533325, "rewards/accuracy_reward_staging": 0.59375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.9375, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 560.6875, "epoch": 18.444444444444443, "grad_norm": 1.573293312721447, "kl": 0.031890869140625, "learning_rate": 9.509956150664795e-07, "loss": 0.0727, "reward": 2.40625, "reward_std": 1.0983919501304626, "rewards/accuracy_reward_staging": 0.125, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 1.0, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 597.71875, "epoch": 18.666666666666668, "grad_norm": 1.3288952801951834, "kl": 0.028411865234375, "learning_rate": 9.491954909459894e-07, "loss": 0.0299, "reward": 4.125, "reward_std": 2.0565126538276672, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 566.875, "epoch": 18.88888888888889, "grad_norm": 1.600349042443185, "kl": 0.03497314453125, "learning_rate": 9.473646649103817e-07, "loss": 0.0048, "reward": 3.46875, "reward_std": 2.3291621804237366, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 527.15625, "epoch": 19.22222222222222, "grad_norm": 2.0117307258354242, "kl": 0.034820556640625, "learning_rate": 9.455032620941839e-07, "loss": 0.0076, "reward": 3.15625, "reward_std": 1.690910965204239, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 548.125, "epoch": 19.444444444444443, "grad_norm": 1.2154400614249532, "kl": 0.034393310546875, "learning_rate": 9.436114097218058e-07, "loss": 0.0153, "reward": 2.34375, "reward_std": 0.9375, "rewards/accuracy_reward_staging": 0.09375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 599.9375, "epoch": 19.666666666666668, "grad_norm": 1.6406138056170174, "kl": 0.029205322265625, "learning_rate": 9.416892370988442e-07, "loss": 0.0752, "reward": 2.75, "reward_std": 1.9128470420837402, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 553.6875, "epoch": 19.88888888888889, "grad_norm": 1.527942838909739, "kl": 0.030364990234375, "learning_rate": 9.397368756032444e-07, "loss": -0.0126, "reward": 4.34375, "reward_std": 3.079783648252487, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 561.53125, "epoch": 20.22222222222222, "grad_norm": 1.6342025185675375, "kl": 0.032318115234375, "learning_rate": 9.377544586763214e-07, "loss": -0.0331, "reward": 4.0625, "reward_std": 2.1620407104492188, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 608.125, "epoch": 20.444444444444443, "grad_norm": 0.9760974984694354, "kl": 0.03082275390625, "learning_rate": 9.357421218136386e-07, "loss": -0.0281, "reward": 2.90625, "reward_std": 1.3726893961429596, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 607.1875, "epoch": 20.666666666666668, "grad_norm": 2.9650567483991894, "kl": 0.0552978515625, "learning_rate": 9.337000025557476e-07, "loss": 0.0494, "reward": 2.78125, "reward_std": 1.907078742980957, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.875, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 648.5625, "epoch": 20.88888888888889, "grad_norm": 1.4868637308996282, "kl": 0.04937744140625, "learning_rate": 9.316282404787869e-07, "loss": 0.0813, "reward": 2.4375, "reward_std": 1.534547746181488, "rewards/accuracy_reward_staging": 0.15625, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.875, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 598.0, "epoch": 21.22222222222222, "grad_norm": 1.5354307812124717, "kl": 0.03326416015625, "learning_rate": 9.295269771849425e-07, "loss": 0.1102, "reward": 3.53125, "reward_std": 2.2993226647377014, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 585.25, "epoch": 21.444444444444443, "grad_norm": 1.21306534102283, "kl": 0.03662109375, "learning_rate": 9.273963562927694e-07, "loss": 0.0034, "reward": 2.90625, "reward_std": 1.0483438968658447, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 568.71875, "epoch": 21.666666666666668, "grad_norm": 11.152096799903676, "kl": 0.09722900390625, "learning_rate": 9.252365234273753e-07, "loss": 0.0125, "reward": 3.1875, "reward_std": 1.9283326417207718, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.96875, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 567.0625, "epoch": 21.88888888888889, "grad_norm": 1.4820021533223564, "kl": 0.04046630859375, "learning_rate": 9.230476262104676e-07, "loss": 0.0631, "reward": 3.40625, "reward_std": 2.233847141265869, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 1.0, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 571.625, "epoch": 22.22222222222222, "grad_norm": 1.6685149630374954, "kl": 0.04840087890625, "learning_rate": 9.208298142502635e-07, "loss": 0.057, "reward": 2.90625, "reward_std": 1.7658206820487976, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.96875, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 652.0, "epoch": 22.444444444444443, "grad_norm": 1.2598669750057847, "kl": 0.037872314453125, "learning_rate": 9.185832391312642e-07, "loss": 0.0397, "reward": 2.1875, "reward_std": 1.0936830341815948, "rewards/accuracy_reward_staging": 0.09375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.84375, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 512.84375, "epoch": 22.666666666666668, "grad_norm": 1.464116156191612, "kl": 0.0408935546875, "learning_rate": 9.163080544038952e-07, "loss": 0.0325, "reward": 3.1875, "reward_std": 2.0054054260253906, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.9375, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 595.0, "epoch": 22.88888888888889, "grad_norm": 1.7502771652549964, "kl": 0.0543212890625, "learning_rate": 9.1400441557401e-07, "loss": 0.1198, "reward": 4.375, "reward_std": 2.6551371216773987, "rewards/accuracy_reward_staging": 0.53125, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.90625, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 575.375, "epoch": 23.22222222222222, "grad_norm": 1.5494132503619473, "kl": 0.04376220703125, "learning_rate": 9.116724800922629e-07, "loss": 0.1098, "reward": 3.6875, "reward_std": 1.9493454694747925, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 577.5, "epoch": 23.444444444444443, "grad_norm": 1.2511045169588764, "kl": 0.0521240234375, "learning_rate": 9.093124073433462e-07, "loss": 0.0389, "reward": 3.5625, "reward_std": 2.1182020902633667, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 1.0, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 590.78125, "epoch": 23.666666666666668, "grad_norm": 1.5974928179741261, "kl": 0.045074462890625, "learning_rate": 9.069243586350975e-07, "loss": -0.0127, "reward": 4.09375, "reward_std": 2.1429253816604614, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 1.0, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 454.1875, "epoch": 23.88888888888889, "grad_norm": 1.885519118261372, "kl": 0.0450439453125, "learning_rate": 9.045084971874737e-07, "loss": 0.0469, "reward": 4.0625, "reward_std": 2.76924729347229, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 538.0625, "epoch": 24.22222222222222, "grad_norm": 1.5682355592026038, "kl": 0.05267333984375, "learning_rate": 9.020649881213958e-07, "loss": 0.0061, "reward": 3.40625, "reward_std": 2.1967990398406982, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.9375, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 632.75, "epoch": 24.444444444444443, "grad_norm": 1.2736403946455588, "kl": 0.044189453125, "learning_rate": 8.995939984474623e-07, "loss": 0.0172, "reward": 3.84375, "reward_std": 2.4564297795295715, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 1.0, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 582.90625, "epoch": 24.666666666666668, "grad_norm": 1.5123549273068009, "kl": 0.04638671875, "learning_rate": 8.970956970545355e-07, "loss": 0.0662, "reward": 3.78125, "reward_std": 2.7111909985542297, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 99 }, { "epoch": 24.88888888888889, "grad_norm": 1.7849471498217702, "learning_rate": 8.945702546981968e-07, "loss": 0.142, "step": 100 }, { "epoch": 24.88888888888889, "eval_clip_ratio": 0.0, "eval_completion_length": 511.125, "eval_kl": 0.073193359375, "eval_loss": -0.007530718110501766, "eval_reward": 2.075, "eval_reward_std": 0.6665439963340759, "eval_rewards/accuracy_reward_staging": 0.05, "eval_rewards/format_reward": 0.85, "eval_rewards/format_reward_staging": 0.975, "eval_runtime": 50.3514, "eval_samples_per_second": 0.715, "eval_steps_per_second": 0.099, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 581.21875, "epoch": 25.22222222222222, "grad_norm": 1.8652012487452174, "kl": 0.05792236328125, "learning_rate": 8.920178439890764e-07, "loss": 0.0112, "reward": 3.46875, "reward_std": 1.8295301795005798, "rewards/accuracy_reward_staging": 0.328125, "rewards/format_reward": 0.890625, "rewards/format_reward_staging": 0.9375, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 597.875, "epoch": 25.444444444444443, "grad_norm": 4.84795988570716, "kl": 0.06231689453125, "learning_rate": 8.894386393810562e-07, "loss": 0.0844, "reward": 2.875, "reward_std": 1.6470783054828644, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.96875, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 562.5, "epoch": 25.666666666666668, "grad_norm": 1.8754521848828094, "kl": 0.052978515625, "learning_rate": 8.868328171593446e-07, "loss": -0.0154, "reward": 4.25, "reward_std": 2.547704756259918, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 1.0, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 537.3125, "epoch": 25.88888888888889, "grad_norm": 1.797756724546587, "kl": 0.05206298828125, "learning_rate": 8.842005554284295e-07, "loss": -0.0275, "reward": 3.84375, "reward_std": 2.51630362868309, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.90625, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 622.53125, "epoch": 26.22222222222222, "grad_norm": 1.300955660750681, "kl": 0.05413818359375, "learning_rate": 8.815420340999033e-07, "loss": 0.0637, "reward": 3.84375, "reward_std": 1.3620327413082123, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 594.25, "epoch": 26.444444444444443, "grad_norm": 1.43085940167237, "kl": 0.0439453125, "learning_rate": 8.788574348801674e-07, "loss": 0.0768, "reward": 4.625, "reward_std": 1.9858438968658447, "rewards/accuracy_reward_staging": 0.53125, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 582.15625, "epoch": 26.666666666666668, "grad_norm": 1.6135777138066925, "kl": 0.06390380859375, "learning_rate": 8.761469412580124e-07, "loss": 0.0142, "reward": 1.96875, "reward_std": 1.00966876745224, "rewards/accuracy_reward_staging": 0.0625, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.875, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 587.0625, "epoch": 26.88888888888889, "grad_norm": 2.1207393888337887, "kl": 0.06134033203125, "learning_rate": 8.734107384920769e-07, "loss": 0.0242, "reward": 4.125, "reward_std": 3.0213340520858765, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 1.0, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 548.40625, "epoch": 27.22222222222222, "grad_norm": 1.5591626897717148, "kl": 0.0465087890625, "learning_rate": 8.706490135981855e-07, "loss": -0.0282, "reward": 4.5625, "reward_std": 2.360237419605255, "rewards/accuracy_reward_staging": 0.53125, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 598.8125, "epoch": 27.444444444444443, "grad_norm": 1.1645802297935302, "kl": 0.04632568359375, "learning_rate": 8.678619553365658e-07, "loss": -0.0278, "reward": 3.21875, "reward_std": 1.8432062864303589, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 610.21875, "epoch": 27.666666666666668, "grad_norm": 1.7617563181087859, "kl": 0.0550537109375, "learning_rate": 8.650497541989481e-07, "loss": -0.0219, "reward": 2.84375, "reward_std": 1.7233919501304626, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 602.5, "epoch": 27.88888888888889, "grad_norm": 1.4790592908519822, "kl": 0.04412841796875, "learning_rate": 8.622126023955445e-07, "loss": 0.0624, "reward": 3.65625, "reward_std": 2.0483438968658447, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 602.21875, "epoch": 28.22222222222222, "grad_norm": 1.4599487812585739, "kl": 0.0484619140625, "learning_rate": 8.593506938419119e-07, "loss": 0.0459, "reward": 3.84375, "reward_std": 0.9925079494714737, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.96875, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 569.71875, "epoch": 28.444444444444443, "grad_norm": 1.3832144010184737, "kl": 0.0467529296875, "learning_rate": 8.564642241456986e-07, "loss": 0.0025, "reward": 3.71875, "reward_std": 1.9233438968658447, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 548.78125, "epoch": 28.666666666666668, "grad_norm": 1.803131512178923, "kl": 0.0576171875, "learning_rate": 8.535533905932737e-07, "loss": -0.0187, "reward": 3.90625, "reward_std": 2.563826858997345, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.96875, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 564.4375, "epoch": 28.88888888888889, "grad_norm": 1.6402368950584498, "kl": 0.05029296875, "learning_rate": 8.506183921362442e-07, "loss": -0.0174, "reward": 3.3125, "reward_std": 2.5466037690639496, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 563.5625, "epoch": 29.22222222222222, "grad_norm": 1.4985773194128882, "kl": 0.04840087890625, "learning_rate": 8.47659429377856e-07, "loss": -0.0153, "reward": 3.875, "reward_std": 2.3320942521095276, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 565.34375, "epoch": 29.444444444444443, "grad_norm": 1.7501844147476033, "kl": 0.05194091796875, "learning_rate": 8.446767045592829e-07, "loss": 0.0359, "reward": 3.84375, "reward_std": 2.3963494896888733, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.96875, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 574.5, "epoch": 29.666666666666668, "grad_norm": 1.2571401212163673, "kl": 0.0498046875, "learning_rate": 8.416704215458042e-07, "loss": 0.0187, "reward": 3.3125, "reward_std": 1.125, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.9375, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 597.25, "epoch": 29.88888888888889, "grad_norm": 1.2235795288016953, "kl": 0.04754638671875, "learning_rate": 8.386407858128706e-07, "loss": -0.0144, "reward": 3.25, "reward_std": 1.5358919501304626, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 597.15625, "epoch": 30.22222222222222, "grad_norm": 1.6274382257749778, "kl": 0.060791015625, "learning_rate": 8.355880044320597e-07, "loss": 0.0121, "reward": 3.34375, "reward_std": 2.7569093704223633, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.84375, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 592.8125, "epoch": 30.444444444444443, "grad_norm": 2.5186927220968895, "kl": 0.09588623046875, "learning_rate": 8.325122860569241e-07, "loss": 0.0081, "reward": 3.15625, "reward_std": 2.1270195841789246, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.875, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 564.1875, "epoch": 30.666666666666668, "grad_norm": 1.4932442148368137, "kl": 0.04656982421875, "learning_rate": 8.294138409087289e-07, "loss": 0.0298, "reward": 3.625, "reward_std": 2.008278489112854, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 1.0, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 607.375, "epoch": 30.88888888888889, "grad_norm": 3.4718877576698746, "kl": 0.076904296875, "learning_rate": 8.262928807620843e-07, "loss": -0.0234, "reward": 3.6875, "reward_std": 2.751339912414551, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.90625, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 531.5625, "epoch": 31.22222222222222, "grad_norm": 1.622119125741056, "kl": 0.05914306640625, "learning_rate": 8.231496189304704e-07, "loss": 0.0119, "reward": 3.78125, "reward_std": 1.9775724411010742, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 1.0, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 645.84375, "epoch": 31.444444444444443, "grad_norm": 1.6061164218143151, "kl": 0.0496826171875, "learning_rate": 8.199842702516582e-07, "loss": 0.0355, "reward": 3.90625, "reward_std": 2.5803541243076324, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.9375, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 597.3125, "epoch": 31.666666666666668, "grad_norm": 1.3457598005679037, "kl": 0.0526123046875, "learning_rate": 8.167970510730252e-07, "loss": -0.0134, "reward": 3.15625, "reward_std": 1.8007422089576721, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 640.25, "epoch": 31.88888888888889, "grad_norm": 1.5569181185599603, "kl": 0.058349609375, "learning_rate": 8.135881792367685e-07, "loss": -0.0192, "reward": 3.59375, "reward_std": 1.5271694660186768, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 1.0, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 574.34375, "epoch": 32.22222222222222, "grad_norm": 1.6790409041925978, "kl": 0.05426025390625, "learning_rate": 8.103578740650156e-07, "loss": -0.0013, "reward": 3.8125, "reward_std": 2.151860535144806, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 1.0, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 568.0, "epoch": 32.44444444444444, "grad_norm": 1.7164186713447234, "kl": 0.0628662109375, "learning_rate": 8.071063563448339e-07, "loss": 0.0355, "reward": 3.09375, "reward_std": 2.110320746898651, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.90625, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 612.125, "epoch": 32.666666666666664, "grad_norm": 1.4484213626473657, "kl": 0.0438232421875, "learning_rate": 8.038338483131406e-07, "loss": 0.0675, "reward": 2.65625, "reward_std": 1.5483438968658447, "rewards/accuracy_reward_staging": 0.15625, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 633.375, "epoch": 32.888888888888886, "grad_norm": 1.4888928164051263, "kl": 0.046630859375, "learning_rate": 8.005405736415125e-07, "loss": 0.003, "reward": 3.5625, "reward_std": 2.257579743862152, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 608.28125, "epoch": 33.22222222222222, "grad_norm": 1.4537634594396451, "kl": 0.05352783203125, "learning_rate": 7.97226757420899e-07, "loss": 0.0072, "reward": 4.53125, "reward_std": 2.650395154953003, "rewards/accuracy_reward_staging": 0.53125, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 589.625, "epoch": 33.44444444444444, "grad_norm": 5.103167634384414, "kl": 0.107421875, "learning_rate": 7.938926261462365e-07, "loss": 0.0303, "reward": 3.96875, "reward_std": 1.4233438968658447, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 1.0, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 576.90625, "epoch": 33.666666666666664, "grad_norm": 5.13739196509469, "kl": 0.09185791015625, "learning_rate": 7.905384077009692e-07, "loss": 0.0254, "reward": 3.40625, "reward_std": 2.5271694660186768, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 565.0, "epoch": 33.888888888888886, "grad_norm": 1.3347218031999781, "kl": 0.05279541015625, "learning_rate": 7.871643313414718e-07, "loss": -0.0269, "reward": 3.78125, "reward_std": 1.9108592867851257, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 596.96875, "epoch": 34.22222222222222, "grad_norm": 1.6203773256898213, "kl": 0.05377197265625, "learning_rate": 7.837706276813818e-07, "loss": -0.0507, "reward": 3.78125, "reward_std": 2.8475868701934814, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.90625, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 548.90625, "epoch": 34.44444444444444, "grad_norm": 1.7589228637659193, "kl": 0.0518798828125, "learning_rate": 7.803575286758363e-07, "loss": 0.0256, "reward": 3.84375, "reward_std": 2.3770764470100403, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 574.90625, "epoch": 34.666666666666664, "grad_norm": 1.465848261824115, "kl": 0.05047607421875, "learning_rate": 7.769252676056186e-07, "loss": 0.0121, "reward": 3.0, "reward_std": 1.999484658241272, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 621.5625, "epoch": 34.888888888888886, "grad_norm": 1.699502675045011, "kl": 0.04669189453125, "learning_rate": 7.734740790612136e-07, "loss": -0.0043, "reward": 3.65625, "reward_std": 2.740947127342224, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.84375, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 575.03125, "epoch": 35.22222222222222, "grad_norm": 1.4180294898308454, "kl": 0.04791259765625, "learning_rate": 7.700041989267736e-07, "loss": 0.0128, "reward": 3.9375, "reward_std": 1.6851893663406372, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 1.0, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 634.3125, "epoch": 35.44444444444444, "grad_norm": 0.97669552258444, "kl": 0.04840087890625, "learning_rate": 7.665158643639969e-07, "loss": 0.0078, "reward": 3.90625, "reward_std": 1.2753951847553253, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 1.0, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 564.9375, "epoch": 35.666666666666664, "grad_norm": 1.4705421421024347, "kl": 0.0458984375, "learning_rate": 7.63009313795917e-07, "loss": 0.0007, "reward": 3.375, "reward_std": 1.9858438968658447, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 630.8125, "epoch": 35.888888888888886, "grad_norm": 1.4040857696410018, "kl": 0.0491943359375, "learning_rate": 7.594847868906076e-07, "loss": 0.0157, "reward": 4.53125, "reward_std": 1.881795346736908, "rewards/accuracy_reward_staging": 0.5625, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.9375, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 597.4375, "epoch": 36.22222222222222, "grad_norm": 1.7416315495447303, "kl": 0.05291748046875, "learning_rate": 7.559425245448005e-07, "loss": 0.1534, "reward": 4.125, "reward_std": 1.7268692255020142, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.90625, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 543.34375, "epoch": 36.44444444444444, "grad_norm": 1.3338618690781434, "kl": 0.05255126953125, "learning_rate": 7.523827688674219e-07, "loss": 0.0048, "reward": 3.46875, "reward_std": 1.7618454992771149, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 579.625, "epoch": 36.666666666666664, "grad_norm": 1.8245501253344487, "kl": 0.04931640625, "learning_rate": 7.488057631630437e-07, "loss": 0.0975, "reward": 3.78125, "reward_std": 2.0842358469963074, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 1.0, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 556.25, "epoch": 36.888888888888886, "grad_norm": 1.60039839729214, "kl": 0.0479736328125, "learning_rate": 7.452117519152541e-07, "loss": -0.0225, "reward": 4.75, "reward_std": 2.8358521461486816, "rewards/accuracy_reward_staging": 0.59375, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.9375, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 575.59375, "epoch": 37.22222222222222, "grad_norm": 1.9262708460562594, "kl": 0.04852294921875, "learning_rate": 7.416009807699481e-07, "loss": 0.0694, "reward": 3.875, "reward_std": 2.4488722383975983, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.9375, "step": 149 }, { "epoch": 37.44444444444444, "grad_norm": 1.4126152849193538, "learning_rate": 7.379736965185368e-07, "loss": 0.0461, "step": 150 }, { "epoch": 37.44444444444444, "eval_clip_ratio": 0.0, "eval_completion_length": 583.05, "eval_kl": 0.045751953125, "eval_loss": -0.002990193199366331, "eval_reward": 2.725, "eval_reward_std": 1.3047046661376953, "eval_rewards/accuracy_reward_staging": 0.175, "eval_rewards/format_reward": 0.875, "eval_rewards/format_reward_staging": 0.975, "eval_runtime": 52.1348, "eval_samples_per_second": 0.691, "eval_steps_per_second": 0.096, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 566.234375, "epoch": 37.666666666666664, "grad_norm": 1.6128745248404066, "kl": 0.0498046875, "learning_rate": 7.343301470810807e-07, "loss": 0.0205, "reward": 3.8125, "reward_std": 2.2092738151550293, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.953125, "rewards/format_reward_staging": 0.984375, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 561.6875, "epoch": 37.888888888888886, "grad_norm": 1.6630192300364095, "kl": 0.051513671875, "learning_rate": 7.306705814893439e-07, "loss": 0.0613, "reward": 4.75, "reward_std": 3.510585069656372, "rewards/accuracy_reward_staging": 0.5625, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 587.40625, "epoch": 38.22222222222222, "grad_norm": 1.5821951322432892, "kl": 0.0535888671875, "learning_rate": 7.269952498697734e-07, "loss": 0.0053, "reward": 3.78125, "reward_std": 2.4141127467155457, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.84375, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 572.6875, "epoch": 38.44444444444444, "grad_norm": 2.2424195208633453, "kl": 0.07366943359375, "learning_rate": 7.233044034264033e-07, "loss": 0.0315, "reward": 3.84375, "reward_std": 2.3134855031967163, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.90625, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 555.9375, "epoch": 38.666666666666664, "grad_norm": 1.529067187866317, "kl": 0.05157470703125, "learning_rate": 7.195982944236852e-07, "loss": 0.0321, "reward": 2.8125, "reward_std": 1.796603798866272, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9375, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 630.0, "epoch": 38.888888888888886, "grad_norm": 1.579548286063579, "kl": 0.050537109375, "learning_rate": 7.158771761692464e-07, "loss": 0.0309, "reward": 4.28125, "reward_std": 2.8335397839546204, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.90625, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 563.0625, "epoch": 39.22222222222222, "grad_norm": 1.4963033786464435, "kl": 0.050048828125, "learning_rate": 7.121413029965769e-07, "loss": 0.0482, "reward": 3.8125, "reward_std": 2.3843142986297607, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.9375, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 576.59375, "epoch": 39.44444444444444, "grad_norm": 1.4775879396602463, "kl": 0.054443359375, "learning_rate": 7.083909302476452e-07, "loss": 0.0164, "reward": 3.71875, "reward_std": 1.9704924821853638, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.90625, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 555.8125, "epoch": 39.666666666666664, "grad_norm": 1.7649876199956425, "kl": 0.0699462890625, "learning_rate": 7.04626314255447e-07, "loss": 0.0019, "reward": 4.4375, "reward_std": 2.7981574535369873, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 1.0, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 498.75, "epoch": 39.888888888888886, "grad_norm": 1.3915513369029784, "kl": 0.0543212890625, "learning_rate": 7.008477123264847e-07, "loss": 0.0433, "reward": 2.90625, "reward_std": 1.3342358469963074, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 569.53125, "epoch": 40.22222222222222, "grad_norm": 1.562840542671513, "kl": 0.053955078125, "learning_rate": 6.970553827231808e-07, "loss": 0.0164, "reward": 4.625, "reward_std": 2.55762779712677, "rewards/accuracy_reward_staging": 0.53125, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 538.3125, "epoch": 40.44444444444444, "grad_norm": 1.4692239574350316, "kl": 0.0526123046875, "learning_rate": 6.932495846462261e-07, "loss": -0.0164, "reward": 3.65625, "reward_std": 1.8189646005630493, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 531.875, "epoch": 40.666666666666664, "grad_norm": 1.5332016106483515, "kl": 0.05316162109375, "learning_rate": 6.894305782168638e-07, "loss": -0.0429, "reward": 4.3125, "reward_std": 2.5211293697357178, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 545.125, "epoch": 40.888888888888886, "grad_norm": 13.136996472534078, "kl": 0.11846923828125, "learning_rate": 6.855986244591103e-07, "loss": -0.0235, "reward": 3.28125, "reward_std": 2.338345527648926, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.90625, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 543.40625, "epoch": 41.22222222222222, "grad_norm": 1.3242743159265937, "kl": 0.04998779296875, "learning_rate": 6.817539852819148e-07, "loss": 0.0115, "reward": 3.1875, "reward_std": 1.375, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 520.1875, "epoch": 41.44444444444444, "grad_norm": 1.2491437366023406, "kl": 0.05328369140625, "learning_rate": 6.778969234612583e-07, "loss": 0.0198, "reward": 4.84375, "reward_std": 1.7444601655006409, "rewards/accuracy_reward_staging": 0.59375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 619.59375, "epoch": 41.666666666666664, "grad_norm": 1.6792411977674075, "kl": 0.05487060546875, "learning_rate": 6.740277026221922e-07, "loss": 0.011, "reward": 3.21875, "reward_std": 2.509488582611084, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.96875, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 539.125, "epoch": 41.888888888888886, "grad_norm": 2.61987913810964, "kl": 0.08526611328125, "learning_rate": 6.701465872208216e-07, "loss": 0.0355, "reward": 5.71875, "reward_std": 2.992280900478363, "rewards/accuracy_reward_staging": 0.78125, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 1.0, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 601.65625, "epoch": 42.22222222222222, "grad_norm": 1.5317010015458066, "kl": 0.0543212890625, "learning_rate": 6.662538425262284e-07, "loss": -0.0412, "reward": 3.75, "reward_std": 2.802945911884308, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 607.5, "epoch": 42.44444444444444, "grad_norm": 1.5445749846218586, "kl": 0.05462646484375, "learning_rate": 6.623497346023417e-07, "loss": -0.0053, "reward": 3.0625, "reward_std": 1.4321783781051636, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.90625, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 531.53125, "epoch": 42.666666666666664, "grad_norm": 1.727090236953967, "kl": 0.05303955078125, "learning_rate": 6.584345302897522e-07, "loss": 0.0752, "reward": 4.9375, "reward_std": 2.6843830347061157, "rewards/accuracy_reward_staging": 0.59375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 511.125, "epoch": 42.888888888888886, "grad_norm": 1.463526052255072, "kl": 0.05108642578125, "learning_rate": 6.545084971874736e-07, "loss": -0.0218, "reward": 4.28125, "reward_std": 2.3289482593536377, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 1.0, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 591.46875, "epoch": 43.22222222222222, "grad_norm": 1.6155726361043279, "kl": 0.06103515625, "learning_rate": 6.505719036346537e-07, "loss": 0.0385, "reward": 3.3125, "reward_std": 2.2124131619930267, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.90625, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 570.96875, "epoch": 43.44444444444444, "grad_norm": 1.3751712901264466, "kl": 0.0545654296875, "learning_rate": 6.466250186922324e-07, "loss": 0.0063, "reward": 3.1875, "reward_std": 2.130874752998352, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.84375, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 545.71875, "epoch": 43.666666666666664, "grad_norm": 1.4756595692040109, "kl": 0.059326171875, "learning_rate": 6.426681121245527e-07, "loss": -0.0295, "reward": 3.59375, "reward_std": 2.3869778215885162, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9375, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 595.1875, "epoch": 43.888888888888886, "grad_norm": 1.4156928353056575, "kl": 0.050048828125, "learning_rate": 6.387014543809223e-07, "loss": -0.0245, "reward": 3.625, "reward_std": 2.184383064508438, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.9375, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 549.28125, "epoch": 44.22222222222222, "grad_norm": 1.6625979237822903, "kl": 0.05389404296875, "learning_rate": 6.347253165771289e-07, "loss": 0.0393, "reward": 4.34375, "reward_std": 2.0728103518486023, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 616.4375, "epoch": 44.44444444444444, "grad_norm": 0.9016620089051227, "kl": 0.04852294921875, "learning_rate": 6.307399704769098e-07, "loss": 0.0327, "reward": 3.3125, "reward_std": 1.9239110946655273, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 554.40625, "epoch": 44.666666666666664, "grad_norm": 1.4445648538792832, "kl": 0.06365966796875, "learning_rate": 6.26745688473377e-07, "loss": 0.0527, "reward": 2.90625, "reward_std": 1.2700245678424835, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 596.6875, "epoch": 44.888888888888886, "grad_norm": 1.5856705116806837, "kl": 0.06280517578125, "learning_rate": 6.227427435703995e-07, "loss": 0.0488, "reward": 3.59375, "reward_std": 2.1598991453647614, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 1.0, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 540.375, "epoch": 45.22222222222222, "grad_norm": 1.4832995345417785, "kl": 0.0472412109375, "learning_rate": 6.187314093639443e-07, "loss": 0.021, "reward": 3.8125, "reward_std": 2.2678900957107544, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 596.75, "epoch": 45.44444444444444, "grad_norm": 1.5871845074006228, "kl": 0.048828125, "learning_rate": 6.147119600233758e-07, "loss": -0.025, "reward": 4.40625, "reward_std": 2.732926845550537, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.9375, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 636.21875, "epoch": 45.666666666666664, "grad_norm": 1.1682267885626447, "kl": 0.050537109375, "learning_rate": 6.106846702727172e-07, "loss": -0.0041, "reward": 3.5625, "reward_std": 1.9367179870605469, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 567.6875, "epoch": 45.888888888888886, "grad_norm": 1.182505436622169, "kl": 0.052490234375, "learning_rate": 6.066498153718734e-07, "loss": -0.0104, "reward": 3.96875, "reward_std": 1.8926886320114136, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 556.84375, "epoch": 46.22222222222222, "grad_norm": 74.95843070592915, "kl": 0.51153564453125, "learning_rate": 6.026076710978171e-07, "loss": -0.0099, "reward": 4.03125, "reward_std": 2.5020731687545776, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.9375, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 635.9375, "epoch": 46.44444444444444, "grad_norm": 1.1802575443084546, "kl": 0.046630859375, "learning_rate": 5.985585137257401e-07, "loss": -0.0104, "reward": 3.75, "reward_std": 1.5358919501304626, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9375, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 584.625, "epoch": 46.666666666666664, "grad_norm": 1.540364554923698, "kl": 0.053955078125, "learning_rate": 5.945026200101702e-07, "loss": 0.0173, "reward": 3.71875, "reward_std": 2.7078438997268677, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.9375, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 511.375, "epoch": 46.888888888888886, "grad_norm": 1.3487938182691792, "kl": 0.05859375, "learning_rate": 5.90440267166055e-07, "loss": 0.0363, "reward": 3.125, "reward_std": 2.2170365154743195, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.96875, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 536.40625, "epoch": 47.22222222222222, "grad_norm": 1.7030200868844614, "kl": 0.054931640625, "learning_rate": 5.863717328498152e-07, "loss": 0.0328, "reward": 3.84375, "reward_std": 2.070079743862152, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.96875, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 555.875, "epoch": 47.44444444444444, "grad_norm": 1.7566836455673576, "kl": 0.05218505859375, "learning_rate": 5.82297295140367e-07, "loss": -0.0381, "reward": 3.75, "reward_std": 2.009314328432083, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 545.8125, "epoch": 47.666666666666664, "grad_norm": 1.594063347049537, "kl": 0.05426025390625, "learning_rate": 5.782172325201155e-07, "loss": 0.0535, "reward": 3.21875, "reward_std": 1.7700316905975342, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 622.8125, "epoch": 47.888888888888886, "grad_norm": 1.5439318867500331, "kl": 0.04937744140625, "learning_rate": 5.741318238559209e-07, "loss": -0.0012, "reward": 4.75, "reward_std": 2.4349581599235535, "rewards/accuracy_reward_staging": 0.5625, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 1.0, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 583.75, "epoch": 48.22222222222222, "grad_norm": 2.5201319810344454, "kl": 0.0770263671875, "learning_rate": 5.700413483800389e-07, "loss": -0.0762, "reward": 3.4375, "reward_std": 1.82216876745224, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.90625, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 599.59375, "epoch": 48.44444444444444, "grad_norm": 1.473198815087056, "kl": 0.05352783203125, "learning_rate": 5.659460856710345e-07, "loss": -0.0055, "reward": 3.5625, "reward_std": 1.9599019289016724, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.9375, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 569.75, "epoch": 48.666666666666664, "grad_norm": 1.6168573027198114, "kl": 0.05010986328125, "learning_rate": 5.618463156346739e-07, "loss": -0.0075, "reward": 4.21875, "reward_std": 1.739636391401291, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9375, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 577.375, "epoch": 48.888888888888886, "grad_norm": 1.5839729942600627, "kl": 0.04180908203125, "learning_rate": 5.577423184847931e-07, "loss": 0.0086, "reward": 3.875, "reward_std": 2.332531690597534, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 532.96875, "epoch": 49.22222222222222, "grad_norm": 1.5767088515541903, "kl": 0.04962158203125, "learning_rate": 5.536343747241459e-07, "loss": 0.0159, "reward": 4.15625, "reward_std": 1.9809716939926147, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 578.28125, "epoch": 49.44444444444444, "grad_norm": 1.3049917889915577, "kl": 0.04583740234375, "learning_rate": 5.495227651252315e-07, "loss": 0.0386, "reward": 4.53125, "reward_std": 1.7373294830322266, "rewards/accuracy_reward_staging": 0.53125, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 546.0625, "epoch": 49.666666666666664, "grad_norm": 1.3164741992532543, "kl": 0.0504150390625, "learning_rate": 5.454077707111041e-07, "loss": 0.0142, "reward": 4.65625, "reward_std": 1.945079743862152, "rewards/accuracy_reward_staging": 0.53125, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 199 }, { "epoch": 49.888888888888886, "grad_norm": 1.3350172420738584, "learning_rate": 5.412896727361662e-07, "loss": 0.0656, "step": 200 }, { "epoch": 49.888888888888886, "eval_clip_ratio": 0.0, "eval_completion_length": 600.85, "eval_kl": 0.047802734375, "eval_loss": 0.025471828877925873, "eval_reward": 2.6, "eval_reward_std": 1.3353363513946532, "eval_rewards/accuracy_reward_staging": 0.15, "eval_rewards/format_reward": 0.9, "eval_rewards/format_reward_staging": 0.95, "eval_runtime": 52.2669, "eval_samples_per_second": 0.689, "eval_steps_per_second": 0.096, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 614.453125, "epoch": 50.22222222222222, "grad_norm": 1.284721283794701, "kl": 0.05389404296875, "learning_rate": 5.371687526669439e-07, "loss": 0.0086, "reward": 3.421875, "reward_std": 2.202674761414528, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.890625, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 545.53125, "epoch": 50.44444444444444, "grad_norm": 1.235503506247465, "kl": 0.0528564453125, "learning_rate": 5.330452921628497e-07, "loss": -0.0137, "reward": 3.5625, "reward_std": 1.246154248714447, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 560.25, "epoch": 50.666666666666664, "grad_norm": 1.9492031211380043, "kl": 0.0654296875, "learning_rate": 5.28919573056932e-07, "loss": -0.049, "reward": 4.28125, "reward_std": 2.934589922428131, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.90625, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 599.3125, "epoch": 50.888888888888886, "grad_norm": 1.567290502007258, "kl": 0.04364013671875, "learning_rate": 5.247918773366111e-07, "loss": 0.0937, "reward": 3.875, "reward_std": 1.930722177028656, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.9375, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 562.6875, "epoch": 51.22222222222222, "grad_norm": 1.4477817793212922, "kl": 0.05084228515625, "learning_rate": 5.206624871244065e-07, "loss": 0.0148, "reward": 2.90625, "reward_std": 1.4091877937316895, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 605.625, "epoch": 51.44444444444444, "grad_norm": 1.5674813338685252, "kl": 0.04931640625, "learning_rate": 5.165316846586541e-07, "loss": 0.0963, "reward": 3.125, "reward_std": 2.1649354100227356, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.9375, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 572.5625, "epoch": 51.666666666666664, "grad_norm": 1.521375079838418, "kl": 0.046875, "learning_rate": 5.123997522742151e-07, "loss": 0.0215, "reward": 3.71875, "reward_std": 2.047757565975189, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.9375, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 524.4375, "epoch": 51.888888888888886, "grad_norm": 1.637742061840183, "kl": 0.04779052734375, "learning_rate": 5.082669723831793e-07, "loss": -0.0249, "reward": 3.59375, "reward_std": 2.858625650405884, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 566.875, "epoch": 52.22222222222222, "grad_norm": 1.5832843072882397, "kl": 0.04449462890625, "learning_rate": 5.041336274555625e-07, "loss": -0.063, "reward": 2.84375, "reward_std": 1.2771694660186768, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 576.0625, "epoch": 52.44444444444444, "grad_norm": 1.5508316387978383, "kl": 0.06103515625, "learning_rate": 5e-07, "loss": -0.0291, "reward": 4.0, "reward_std": 2.082531690597534, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 611.71875, "epoch": 52.666666666666664, "grad_norm": 1.6164552079690877, "kl": 0.04437255859375, "learning_rate": 4.958663725444375e-07, "loss": 0.0102, "reward": 4.40625, "reward_std": 2.5580477714538574, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 563.3125, "epoch": 52.888888888888886, "grad_norm": 1.5726439650006456, "kl": 0.05096435546875, "learning_rate": 4.917330276168208e-07, "loss": -0.0031, "reward": 4.96875, "reward_std": 2.3175911903381348, "rewards/accuracy_reward_staging": 0.625, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 1.0, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 616.375, "epoch": 53.22222222222222, "grad_norm": 1.7880025106936461, "kl": 0.04498291015625, "learning_rate": 4.87600247725785e-07, "loss": 0.066, "reward": 3.1875, "reward_std": 1.891027882695198, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.90625, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 556.6875, "epoch": 53.44444444444444, "grad_norm": 2.0232137942713573, "kl": 0.0498046875, "learning_rate": 4.834683153413459e-07, "loss": 0.0311, "reward": 3.5625, "reward_std": 1.6434174478054047, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 555.09375, "epoch": 53.666666666666664, "grad_norm": 1.4253180533139413, "kl": 0.0416259765625, "learning_rate": 4.793375128755933e-07, "loss": -0.0401, "reward": 4.03125, "reward_std": 2.570079743862152, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 597.4375, "epoch": 53.888888888888886, "grad_norm": 1.740974206086713, "kl": 0.04815673828125, "learning_rate": 4.752081226633888e-07, "loss": -0.038, "reward": 4.34375, "reward_std": 2.6059716939926147, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 564.0625, "epoch": 54.22222222222222, "grad_norm": 1.6032171003103113, "kl": 0.05572509765625, "learning_rate": 4.71080426943068e-07, "loss": 0.0092, "reward": 3.0625, "reward_std": 1.996816635131836, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.90625, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 530.03125, "epoch": 54.44444444444444, "grad_norm": 1.3127356050754018, "kl": 0.0540771484375, "learning_rate": 4.669547078371503e-07, "loss": -0.0245, "reward": 6.59375, "reward_std": 2.073159486055374, "rewards/accuracy_reward_staging": 0.9375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 608.3125, "epoch": 54.666666666666664, "grad_norm": 1.7016991990394592, "kl": 0.05010986328125, "learning_rate": 4.628312473330562e-07, "loss": 0.0702, "reward": 3.875, "reward_std": 2.482748866081238, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 549.375, "epoch": 54.888888888888886, "grad_norm": 1.3141608271515532, "kl": 0.04742431640625, "learning_rate": 4.5871032726383385e-07, "loss": 0.0552, "reward": 3.125, "reward_std": 1.3886407911777496, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 578.71875, "epoch": 55.22222222222222, "grad_norm": 1.3932263109990592, "kl": 0.04364013671875, "learning_rate": 4.5459222928889587e-07, "loss": 0.051, "reward": 3.71875, "reward_std": 1.7805703282356262, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.9375, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 588.4375, "epoch": 55.44444444444444, "grad_norm": 1.5339621078239263, "kl": 0.04962158203125, "learning_rate": 4.5047723487476864e-07, "loss": -0.0216, "reward": 3.46875, "reward_std": 2.488185405731201, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 1.0, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 585.34375, "epoch": 55.666666666666664, "grad_norm": 1.6607509386936015, "kl": 0.04962158203125, "learning_rate": 4.463656252758542e-07, "loss": 0.0452, "reward": 3.8125, "reward_std": 2.171033263206482, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.9375, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 631.75, "epoch": 55.888888888888886, "grad_norm": 1.5614778713632624, "kl": 0.04669189453125, "learning_rate": 4.4225768151520694e-07, "loss": 0.0801, "reward": 3.5625, "reward_std": 2.430722177028656, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.9375, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 613.75, "epoch": 56.22222222222222, "grad_norm": 1.5004046938088074, "kl": 0.05950927734375, "learning_rate": 4.381536843653261e-07, "loss": 0.0698, "reward": 3.59375, "reward_std": 2.5734615325927734, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.9375, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 565.375, "epoch": 56.44444444444444, "grad_norm": 1.3766714019303354, "kl": 0.04168701171875, "learning_rate": 4.340539143289655e-07, "loss": 0.0233, "reward": 3.5, "reward_std": 2.0, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 553.125, "epoch": 56.666666666666664, "grad_norm": 1.307050706736634, "kl": 0.05133056640625, "learning_rate": 4.2995865161996104e-07, "loss": 0.0181, "reward": 4.0625, "reward_std": 2.421202301979065, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 516.4375, "epoch": 56.888888888888886, "grad_norm": 1.5405733998671278, "kl": 0.0562744140625, "learning_rate": 4.258681761440789e-07, "loss": 0.0017, "reward": 4.03125, "reward_std": 2.49512779712677, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 621.84375, "epoch": 57.22222222222222, "grad_norm": 1.606949877632979, "kl": 0.044189453125, "learning_rate": 4.2178276747988444e-07, "loss": -0.0076, "reward": 4.3125, "reward_std": 2.390491783618927, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.90625, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 597.84375, "epoch": 57.44444444444444, "grad_norm": 1.5411205221206894, "kl": 0.0574951171875, "learning_rate": 4.1770270485963294e-07, "loss": -0.0387, "reward": 3.125, "reward_std": 2.1638975143432617, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 603.1875, "epoch": 57.666666666666664, "grad_norm": 1.3383276534008064, "kl": 0.04473876953125, "learning_rate": 4.1362826715018497e-07, "loss": 0.0122, "reward": 3.6875, "reward_std": 1.9202269613742828, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.90625, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 552.25, "epoch": 57.888888888888886, "grad_norm": 1.7484795613881616, "kl": 0.06341552734375, "learning_rate": 4.095597328339452e-07, "loss": -0.0426, "reward": 4.46875, "reward_std": 2.5560158491134644, "rewards/accuracy_reward_staging": 0.53125, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.96875, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 585.125, "epoch": 58.22222222222222, "grad_norm": 1.5442704440086175, "kl": 0.05377197265625, "learning_rate": 4.0549737998982994e-07, "loss": -0.0062, "reward": 3.65625, "reward_std": 2.2512659430503845, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 601.59375, "epoch": 58.44444444444444, "grad_norm": 1.3070749287077408, "kl": 0.05706787109375, "learning_rate": 4.0144148627425986e-07, "loss": 0.0357, "reward": 4.5625, "reward_std": 2.173893690109253, "rewards/accuracy_reward_staging": 0.53125, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 1.0, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 549.25, "epoch": 58.666666666666664, "grad_norm": 1.568215525888831, "kl": 0.04644775390625, "learning_rate": 3.973923289021829e-07, "loss": -0.0236, "reward": 3.375, "reward_std": 2.125, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 559.875, "epoch": 58.888888888888886, "grad_norm": 1.247655763308189, "kl": 0.05523681640625, "learning_rate": 3.9335018462812664e-07, "loss": 0.0335, "reward": 4.40625, "reward_std": 1.7515006065368652, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 608.75, "epoch": 59.22222222222222, "grad_norm": 1.4876134624852135, "kl": 0.05291748046875, "learning_rate": 3.893153297272828e-07, "loss": 0.0246, "reward": 3.28125, "reward_std": 1.5280899405479431, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.9375, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 583.0, "epoch": 59.44444444444444, "grad_norm": 1.6243663627358595, "kl": 0.04718017578125, "learning_rate": 3.8528803997662423e-07, "loss": -0.0226, "reward": 4.5625, "reward_std": 2.9370444416999817, "rewards/accuracy_reward_staging": 0.53125, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 584.90625, "epoch": 59.666666666666664, "grad_norm": 1.579154131750563, "kl": 0.05328369140625, "learning_rate": 3.812685906360557e-07, "loss": -0.0118, "reward": 3.5625, "reward_std": 1.8252411782741547, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 602.875, "epoch": 59.888888888888886, "grad_norm": 1.6568742956735238, "kl": 0.05029296875, "learning_rate": 3.772572564296004e-07, "loss": 0.0049, "reward": 4.21875, "reward_std": 2.6711304783821106, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 527.71875, "epoch": 60.22222222222222, "grad_norm": 1.5110623474715636, "kl": 0.05316162109375, "learning_rate": 3.7325431152662294e-07, "loss": 0.004, "reward": 3.65625, "reward_std": 2.44047012925148, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 607.71875, "epoch": 60.44444444444444, "grad_norm": 1.5588742571636938, "kl": 0.05126953125, "learning_rate": 3.692600295230901e-07, "loss": 0.0174, "reward": 4.125, "reward_std": 2.93262779712677, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 598.375, "epoch": 60.666666666666664, "grad_norm": 1.4200468362196192, "kl": 0.05487060546875, "learning_rate": 3.6527468342287096e-07, "loss": 0.1256, "reward": 3.8125, "reward_std": 2.782258152961731, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.90625, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 598.9375, "epoch": 60.888888888888886, "grad_norm": 2.112230364965324, "kl": 0.06414794921875, "learning_rate": 3.612985456190778e-07, "loss": -0.0099, "reward": 4.0625, "reward_std": 2.503733992576599, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 1.0, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 580.84375, "epoch": 61.22222222222222, "grad_norm": 1.5256554050376716, "kl": 0.0540771484375, "learning_rate": 3.5733188787544746e-07, "loss": 0.0285, "reward": 3.75, "reward_std": 2.553140878677368, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 553.53125, "epoch": 61.44444444444444, "grad_norm": 1.5714805767176323, "kl": 0.0645751953125, "learning_rate": 3.533749813077677e-07, "loss": 0.0666, "reward": 4.71875, "reward_std": 2.595756232738495, "rewards/accuracy_reward_staging": 0.59375, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.9375, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 570.90625, "epoch": 61.666666666666664, "grad_norm": 1.3717833382169582, "kl": 0.05242919921875, "learning_rate": 3.4942809636534633e-07, "loss": 0.0464, "reward": 4.375, "reward_std": 1.9917186498641968, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 648.6875, "epoch": 61.888888888888886, "grad_norm": 1.281888219474357, "kl": 0.05694580078125, "learning_rate": 3.454915028125263e-07, "loss": -0.0053, "reward": 4.1875, "reward_std": 1.8432075381278992, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.90625, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 601.28125, "epoch": 62.22222222222222, "grad_norm": 1.2189149322070956, "kl": 0.05279541015625, "learning_rate": 3.415654697102478e-07, "loss": -0.0095, "reward": 3.65625, "reward_std": 1.4233438968658447, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 249 }, { "epoch": 62.44444444444444, "grad_norm": 1.7869776388477054, "learning_rate": 3.3765026539765827e-07, "loss": 0.0694, "step": 250 }, { "epoch": 62.44444444444444, "eval_clip_ratio": 0.0, "eval_completion_length": 597.85, "eval_kl": 0.050439453125, "eval_loss": 0.033870112150907516, "eval_reward": 2.5, "eval_reward_std": 1.5911447525024414, "eval_rewards/accuracy_reward_staging": 0.15, "eval_rewards/format_reward": 0.825, "eval_rewards/format_reward_staging": 0.925, "eval_runtime": 53.5113, "eval_samples_per_second": 0.673, "eval_steps_per_second": 0.093, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 543.40625, "epoch": 62.666666666666664, "grad_norm": 1.580273768681387, "kl": 0.058380126953125, "learning_rate": 3.337461574737716e-07, "loss": 0.0381, "reward": 3.59375, "reward_std": 1.963532954454422, "rewards/accuracy_reward_staging": 0.359375, "rewards/format_reward": 0.859375, "rewards/format_reward_staging": 0.9375, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 629.9375, "epoch": 62.888888888888886, "grad_norm": 1.4446074061408753, "kl": 0.04742431640625, "learning_rate": 3.2985341277917846e-07, "loss": 0.0576, "reward": 3.5625, "reward_std": 1.8048822581768036, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 610.03125, "epoch": 63.22222222222222, "grad_norm": 2.4797282896452084, "kl": 0.06011962890625, "learning_rate": 3.2597229737780774e-07, "loss": 0.0258, "reward": 2.71875, "reward_std": 1.841366171836853, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.84375, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 535.8125, "epoch": 63.44444444444444, "grad_norm": 1.413545282954978, "kl": 0.04827880859375, "learning_rate": 3.221030765387417e-07, "loss": 0.0266, "reward": 4.0, "reward_std": 1.7409893572330475, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.90625, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 625.125, "epoch": 63.666666666666664, "grad_norm": 1.4981329871397806, "kl": 0.04962158203125, "learning_rate": 3.1824601471808497e-07, "loss": 0.0841, "reward": 4.5625, "reward_std": 3.0762142539024353, "rewards/accuracy_reward_staging": 0.5625, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.9375, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 633.125, "epoch": 63.888888888888886, "grad_norm": 1.4534244133457102, "kl": 0.04656982421875, "learning_rate": 3.1440137554088953e-07, "loss": 0.029, "reward": 3.84375, "reward_std": 2.296931117773056, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 519.59375, "epoch": 64.22222222222223, "grad_norm": 1.586114819510263, "kl": 0.05950927734375, "learning_rate": 3.1056942178313604e-07, "loss": 0.0666, "reward": 4.375, "reward_std": 2.7632179856300354, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9375, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 554.5, "epoch": 64.44444444444444, "grad_norm": 1.4820895514814123, "kl": 0.057373046875, "learning_rate": 3.06750415353774e-07, "loss": 0.015, "reward": 4.34375, "reward_std": 2.6667675375938416, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 594.3125, "epoch": 64.66666666666667, "grad_norm": 1.4710703551980364, "kl": 0.05108642578125, "learning_rate": 3.029446172768193e-07, "loss": -0.0532, "reward": 3.71875, "reward_std": 1.9592358469963074, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 615.125, "epoch": 64.88888888888889, "grad_norm": 1.195494273136427, "kl": 0.05078125, "learning_rate": 2.9915228767351535e-07, "loss": -0.0471, "reward": 3.71875, "reward_std": 1.5842358469963074, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 582.78125, "epoch": 65.22222222222223, "grad_norm": 1.0142470745003664, "kl": 0.0562744140625, "learning_rate": 2.9537368574455303e-07, "loss": 0.0116, "reward": 3.90625, "reward_std": 1.3764855861663818, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 551.5625, "epoch": 65.44444444444444, "grad_norm": 1.2621208103940496, "kl": 0.045166015625, "learning_rate": 2.916090697523549e-07, "loss": 0.0065, "reward": 3.5625, "reward_std": 1.8217839002609253, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 639.625, "epoch": 65.66666666666667, "grad_norm": 1.2178177535688726, "kl": 0.06982421875, "learning_rate": 2.878586970034232e-07, "loss": 0.0063, "reward": 2.8125, "reward_std": 1.2878219783306122, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.875, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 582.875, "epoch": 65.88888888888889, "grad_norm": 1.6224844954097977, "kl": 0.04864501953125, "learning_rate": 2.841228238307536e-07, "loss": -0.0201, "reward": 5.03125, "reward_std": 2.2327269315719604, "rewards/accuracy_reward_staging": 0.625, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 637.15625, "epoch": 66.22222222222223, "grad_norm": 1.2308430687264216, "kl": 0.05279541015625, "learning_rate": 2.8040170557631485e-07, "loss": 0.0153, "reward": 3.46875, "reward_std": 2.0372338593006134, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 600.4375, "epoch": 66.44444444444444, "grad_norm": 1.5321111568180714, "kl": 0.04827880859375, "learning_rate": 2.7669559657359673e-07, "loss": -0.0491, "reward": 3.8125, "reward_std": 2.4646694660186768, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 1.0, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 615.5, "epoch": 66.66666666666667, "grad_norm": 1.3679824700888612, "kl": 0.05462646484375, "learning_rate": 2.730047501302266e-07, "loss": 0.0308, "reward": 3.09375, "reward_std": 2.642750769853592, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.90625, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 549.5, "epoch": 66.88888888888889, "grad_norm": 1.5033503223897624, "kl": 0.0577392578125, "learning_rate": 2.6932941851065615e-07, "loss": -0.0215, "reward": 4.9375, "reward_std": 2.482675850391388, "rewards/accuracy_reward_staging": 0.59375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 553.34375, "epoch": 67.22222222222223, "grad_norm": 1.4290897914516596, "kl": 0.05340576171875, "learning_rate": 2.656698529189193e-07, "loss": 0.0366, "reward": 3.78125, "reward_std": 1.9895031452178955, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 1.0, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 575.1875, "epoch": 67.44444444444444, "grad_norm": 1.579184850033335, "kl": 0.0518798828125, "learning_rate": 2.620263034814632e-07, "loss": 0.0078, "reward": 4.4375, "reward_std": 2.323539137840271, "rewards/accuracy_reward_staging": 0.53125, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.90625, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 584.34375, "epoch": 67.66666666666667, "grad_norm": 1.482511162141472, "kl": 0.0482177734375, "learning_rate": 2.58399019230052e-07, "loss": -0.0587, "reward": 3.6875, "reward_std": 2.195499747991562, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 592.0, "epoch": 67.88888888888889, "grad_norm": 1.4652114217200525, "kl": 0.049560546875, "learning_rate": 2.547882480847461e-07, "loss": 0.0021, "reward": 3.1875, "reward_std": 2.073539137840271, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 598.59375, "epoch": 68.22222222222223, "grad_norm": 1.5345326135427537, "kl": 0.04913330078125, "learning_rate": 2.5119423683695657e-07, "loss": -0.0357, "reward": 4.25, "reward_std": 2.9848236441612244, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 597.4375, "epoch": 68.44444444444444, "grad_norm": 1.5990838171502337, "kl": 0.061279296875, "learning_rate": 2.476172311325783e-07, "loss": 0.0292, "reward": 5.1875, "reward_std": 2.957588255405426, "rewards/accuracy_reward_staging": 0.6875, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.90625, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 632.71875, "epoch": 68.66666666666667, "grad_norm": 2.3340038254897566, "kl": 0.06951904296875, "learning_rate": 2.440574754551996e-07, "loss": 0.0246, "reward": 3.5, "reward_std": 2.0238241851329803, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 1.0, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 648.0625, "epoch": 68.88888888888889, "grad_norm": 1.5884432300379054, "kl": 0.04443359375, "learning_rate": 2.4051521310939254e-07, "loss": 0.1177, "reward": 4.0, "reward_std": 1.8069141209125519, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.90625, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 546.625, "epoch": 69.22222222222223, "grad_norm": 2.985922020759926, "kl": 0.10992431640625, "learning_rate": 2.3699068620408301e-07, "loss": 0.0152, "reward": 3.15625, "reward_std": 1.511039137840271, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 1.0, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 602.25, "epoch": 69.44444444444444, "grad_norm": 1.5923870878518707, "kl": 0.056396484375, "learning_rate": 2.3348413563600323e-07, "loss": 0.0176, "reward": 4.5, "reward_std": 2.31710484623909, "rewards/accuracy_reward_staging": 0.53125, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 598.46875, "epoch": 69.66666666666667, "grad_norm": 1.5951806343259411, "kl": 0.04925537109375, "learning_rate": 2.2999580107322654e-07, "loss": 0.0929, "reward": 4.9375, "reward_std": 2.494741439819336, "rewards/accuracy_reward_staging": 0.59375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 520.6875, "epoch": 69.88888888888889, "grad_norm": 1.5129561001363085, "kl": 0.0699462890625, "learning_rate": 2.2652592093878665e-07, "loss": 0.0125, "reward": 4.25, "reward_std": 1.878759890794754, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.9375, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 645.0, "epoch": 70.22222222222223, "grad_norm": 1.5182810808110982, "kl": 0.0643310546875, "learning_rate": 2.2307473239438152e-07, "loss": 0.01, "reward": 4.40625, "reward_std": 2.5910332798957825, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 1.0, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 569.21875, "epoch": 70.44444444444444, "grad_norm": 1.8382342741040079, "kl": 0.05499267578125, "learning_rate": 2.1964247132416368e-07, "loss": 0.0019, "reward": 4.40625, "reward_std": 3.0214737951755524, "rewards/accuracy_reward_staging": 0.5625, "rewards/format_reward": 0.6875, "rewards/format_reward_staging": 0.90625, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 608.0, "epoch": 70.66666666666667, "grad_norm": 1.7202842060510593, "kl": 0.04736328125, "learning_rate": 2.1622937231861822e-07, "loss": 0.0307, "reward": 3.375, "reward_std": 2.42453271150589, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.875, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 600.9375, "epoch": 70.88888888888889, "grad_norm": 1.4517912073118557, "kl": 0.04290771484375, "learning_rate": 2.128356686585282e-07, "loss": 0.0476, "reward": 3.75, "reward_std": 1.7858919501304626, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9375, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 649.53125, "epoch": 71.22222222222223, "grad_norm": 1.259142583692514, "kl": 0.0467529296875, "learning_rate": 2.0946159229903088e-07, "loss": 0.0839, "reward": 2.84375, "reward_std": 1.5846085250377655, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.875, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 625.875, "epoch": 71.44444444444444, "grad_norm": 1.340679975407133, "kl": 0.0565185546875, "learning_rate": 2.0610737385376348e-07, "loss": 0.0085, "reward": 3.59375, "reward_std": 1.975972980260849, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.875, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 603.34375, "epoch": 71.66666666666667, "grad_norm": 1.4289962866267603, "kl": 0.06005859375, "learning_rate": 2.0277324257910106e-07, "loss": 0.0185, "reward": 5.5, "reward_std": 2.4536279439926147, "rewards/accuracy_reward_staging": 0.75, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.9375, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 558.25, "epoch": 71.88888888888889, "grad_norm": 1.6936868571901433, "kl": 0.0546875, "learning_rate": 1.9945942635848745e-07, "loss": 0.0145, "reward": 3.78125, "reward_std": 2.1591877937316895, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 636.28125, "epoch": 72.22222222222223, "grad_norm": 1.3457751552584203, "kl": 0.0472412109375, "learning_rate": 1.9616615168685942e-07, "loss": 0.0082, "reward": 3.375, "reward_std": 1.7216877937316895, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 565.1875, "epoch": 72.44444444444444, "grad_norm": 1.1708504647450504, "kl": 0.0599365234375, "learning_rate": 1.9289364365516607e-07, "loss": 0.015, "reward": 4.46875, "reward_std": 1.2958193719387054, "rewards/accuracy_reward_staging": 0.53125, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 597.125, "epoch": 72.66666666666667, "grad_norm": 2.00516555564966, "kl": 0.065185546875, "learning_rate": 1.896421259349844e-07, "loss": 0.0357, "reward": 4.21875, "reward_std": 2.589491307735443, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 672.75, "epoch": 72.88888888888889, "grad_norm": 2.8218244852832335, "kl": 0.09649658203125, "learning_rate": 1.8641182076323148e-07, "loss": -0.0058, "reward": 5.03125, "reward_std": 3.2576534748077393, "rewards/accuracy_reward_staging": 0.625, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 577.65625, "epoch": 73.22222222222223, "grad_norm": 1.7581055322994823, "kl": 0.06195068359375, "learning_rate": 1.8320294892697475e-07, "loss": 0.0534, "reward": 3.0, "reward_std": 2.1200742721557617, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.75, "rewards/format_reward_staging": 0.84375, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 562.6875, "epoch": 73.44444444444444, "grad_norm": 1.5050675135024016, "kl": 0.0499267578125, "learning_rate": 1.8001572974834168e-07, "loss": 0.0343, "reward": 4.0, "reward_std": 1.9108919501304626, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 590.5, "epoch": 73.66666666666667, "grad_norm": 12.895158631725321, "kl": 0.12432861328125, "learning_rate": 1.768503810695295e-07, "loss": 0.0513, "reward": 3.46875, "reward_std": 1.6672459840774536, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 1.0, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 609.125, "epoch": 73.88888888888889, "grad_norm": 1.7151482870021748, "kl": 0.07269287109375, "learning_rate": 1.7370711923791564e-07, "loss": -0.0106, "reward": 5.625, "reward_std": 2.8527393341064453, "rewards/accuracy_reward_staging": 0.78125, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.875, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 675.34375, "epoch": 74.22222222222223, "grad_norm": 1.6055158333490096, "kl": 0.0538330078125, "learning_rate": 1.70586159091271e-07, "loss": 0.0916, "reward": 3.53125, "reward_std": 2.737855911254883, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.6875, "rewards/format_reward_staging": 0.8125, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 638.0, "epoch": 74.44444444444444, "grad_norm": 1.3927154732757459, "kl": 0.0494384765625, "learning_rate": 1.674877139430758e-07, "loss": -0.0039, "reward": 3.5, "reward_std": 2.132579743862152, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 597.78125, "epoch": 74.66666666666667, "grad_norm": 1.2941317675033293, "kl": 0.05804443359375, "learning_rate": 1.6441199556794034e-07, "loss": 0.0582, "reward": 3.28125, "reward_std": 2.0324151515960693, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 299 }, { "epoch": 74.88888888888889, "grad_norm": 1.1895166775660873, "learning_rate": 1.6135921418712955e-07, "loss": 0.0154, "step": 300 }, { "epoch": 74.88888888888889, "eval_clip_ratio": 0.0, "eval_completion_length": 558.5, "eval_kl": 0.055908203125, "eval_loss": 0.04830198734998703, "eval_reward": 3.25, "eval_reward_std": 2.227747082710266, "eval_rewards/accuracy_reward_staging": 0.275, "eval_rewards/format_reward": 0.9, "eval_rewards/format_reward_staging": 0.975, "eval_runtime": 50.8525, "eval_samples_per_second": 0.708, "eval_steps_per_second": 0.098, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 522.46875, "epoch": 75.22222222222223, "grad_norm": 1.2719829976418617, "kl": 0.05950927734375, "learning_rate": 1.5832957845419582e-07, "loss": -0.0239, "reward": 4.078125, "reward_std": 1.734619602560997, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.921875, "rewards/format_reward_staging": 0.96875, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 531.75, "epoch": 75.44444444444444, "grad_norm": 1.4701942608061176, "kl": 0.05584716796875, "learning_rate": 1.553232954407171e-07, "loss": -0.0222, "reward": 4.46875, "reward_std": 1.8445461988449097, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 606.6875, "epoch": 75.66666666666667, "grad_norm": 0.979841248868734, "kl": 0.0506591796875, "learning_rate": 1.52340570622144e-07, "loss": 0.0094, "reward": 4.34375, "reward_std": 1.0341877937316895, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 645.0, "epoch": 75.88888888888889, "grad_norm": 1.2907279139619887, "kl": 0.05084228515625, "learning_rate": 1.493816078637557e-07, "loss": 0.0349, "reward": 4.03125, "reward_std": 2.768365204334259, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.9375, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 612.03125, "epoch": 76.22222222222223, "grad_norm": 1.3052082852261886, "kl": 0.06219482421875, "learning_rate": 1.4644660940672627e-07, "loss": 0.0241, "reward": 3.90625, "reward_std": 1.5625, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 1.0, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 552.0, "epoch": 76.44444444444444, "grad_norm": 1.6816507380806482, "kl": 0.0640869140625, "learning_rate": 1.435357758543015e-07, "loss": 0.0623, "reward": 3.5, "reward_std": 2.3343209326267242, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.90625, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 605.96875, "epoch": 76.66666666666667, "grad_norm": 1.7963549332670843, "kl": 0.05462646484375, "learning_rate": 1.4064930615808806e-07, "loss": -0.0141, "reward": 3.90625, "reward_std": 3.359531879425049, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 559.5625, "epoch": 76.88888888888889, "grad_norm": 1.3270350222684457, "kl": 0.0548095703125, "learning_rate": 1.3778739760445552e-07, "loss": 0.0232, "reward": 3.53125, "reward_std": 2.031329423189163, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 574.9375, "epoch": 77.22222222222223, "grad_norm": 1.403581677625956, "kl": 0.0579833984375, "learning_rate": 1.349502458010519e-07, "loss": 0.0045, "reward": 3.40625, "reward_std": 1.5280899405479431, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.9375, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 571.4375, "epoch": 77.44444444444444, "grad_norm": 1.4518085181139868, "kl": 0.05694580078125, "learning_rate": 1.321380446634342e-07, "loss": -0.0332, "reward": 4.53125, "reward_std": 2.796904981136322, "rewards/accuracy_reward_staging": 0.5625, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.90625, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 592.375, "epoch": 77.66666666666667, "grad_norm": 1.4193852483613092, "kl": 0.04937744140625, "learning_rate": 1.2935098640181457e-07, "loss": 0.0097, "reward": 3.71875, "reward_std": 1.6591877937316895, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 576.5625, "epoch": 77.88888888888889, "grad_norm": 1.5563017115814217, "kl": 0.055419921875, "learning_rate": 1.2658926150792322e-07, "loss": 0.0595, "reward": 4.03125, "reward_std": 2.8135814666748047, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 618.15625, "epoch": 78.22222222222223, "grad_norm": 1.7591845192775064, "kl": 0.05523681640625, "learning_rate": 1.2385305874198775e-07, "loss": -0.0554, "reward": 2.625, "reward_std": 1.8215623199939728, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.90625, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 554.40625, "epoch": 78.44444444444444, "grad_norm": 1.532211221198721, "kl": 0.04986572265625, "learning_rate": 1.2114256511983274e-07, "loss": 0.0323, "reward": 4.40625, "reward_std": 2.975598633289337, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 563.71875, "epoch": 78.66666666666667, "grad_norm": 1.827273461906554, "kl": 0.054931640625, "learning_rate": 1.1845796590009683e-07, "loss": 0.1089, "reward": 4.28125, "reward_std": 2.9560980796813965, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.90625, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 607.125, "epoch": 78.88888888888889, "grad_norm": 1.5705329008308428, "kl": 0.05218505859375, "learning_rate": 1.1579944457157059e-07, "loss": 0.0714, "reward": 3.53125, "reward_std": 2.3595376014709473, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.84375, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 576.59375, "epoch": 79.22222222222223, "grad_norm": 1.2784333227971698, "kl": 0.04998779296875, "learning_rate": 1.1316718284065535e-07, "loss": -0.0327, "reward": 3.1875, "reward_std": 1.75, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 598.6875, "epoch": 79.44444444444444, "grad_norm": 1.2996658268690655, "kl": 0.05255126953125, "learning_rate": 1.1056136061894384e-07, "loss": -0.0387, "reward": 4.5, "reward_std": 1.6467358469963074, "rewards/accuracy_reward_staging": 0.53125, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 576.25, "epoch": 79.66666666666667, "grad_norm": 1.5574614421463486, "kl": 0.04937744140625, "learning_rate": 1.0798215601092353e-07, "loss": 0.0303, "reward": 4.375, "reward_std": 2.325068473815918, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 626.0625, "epoch": 79.88888888888889, "grad_norm": 1.5754896085780978, "kl": 0.0546875, "learning_rate": 1.0542974530180327e-07, "loss": 0.0137, "reward": 4.1875, "reward_std": 2.3722406029701233, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.9375, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 550.71875, "epoch": 80.22222222222223, "grad_norm": 1.610726394417215, "kl": 0.06024169921875, "learning_rate": 1.0290430294546448e-07, "loss": 0.013, "reward": 3.90625, "reward_std": 2.4335986375808716, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9375, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 655.25, "epoch": 80.44444444444444, "grad_norm": 1.4911116339078845, "kl": 0.0523681640625, "learning_rate": 1.0040600155253764e-07, "loss": 0.0332, "reward": 2.78125, "reward_std": 1.3004322350025177, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 519.3125, "epoch": 80.66666666666667, "grad_norm": 1.6907992973095978, "kl": 0.0538330078125, "learning_rate": 9.793501187860431e-08, "loss": -0.0401, "reward": 4.0, "reward_std": 2.362515449523926, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 538.5, "epoch": 80.88888888888889, "grad_norm": 1.646427054544714, "kl": 0.063232421875, "learning_rate": 9.549150281252632e-08, "loss": -0.0039, "reward": 4.15625, "reward_std": 2.2053900957107544, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 562.03125, "epoch": 81.22222222222223, "grad_norm": 1.2401904323679076, "kl": 0.059814453125, "learning_rate": 9.307564136490254e-08, "loss": 0.0337, "reward": 2.6875, "reward_std": 1.4073790609836578, "rewards/accuracy_reward_staging": 0.15625, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 1.0, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 593.3125, "epoch": 81.44444444444444, "grad_norm": 1.5857599353606664, "kl": 0.0521240234375, "learning_rate": 9.068759265665382e-08, "loss": 0.0031, "reward": 3.46875, "reward_std": 2.041439712047577, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 636.15625, "epoch": 81.66666666666667, "grad_norm": 1.464039724178544, "kl": 0.04815673828125, "learning_rate": 8.832751990773712e-08, "loss": -0.033, "reward": 4.0625, "reward_std": 2.3850997388362885, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9375, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 550.625, "epoch": 81.88888888888889, "grad_norm": 1.546052526056493, "kl": 0.05743408203125, "learning_rate": 8.599558442598998e-08, "loss": 0.0427, "reward": 4.15625, "reward_std": 2.8091025352478027, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.875, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 592.53125, "epoch": 82.22222222222223, "grad_norm": 1.359491111417973, "kl": 0.05621337890625, "learning_rate": 8.369194559610481e-08, "loss": 0.0752, "reward": 3.03125, "reward_std": 1.4954701960086823, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.90625, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 608.78125, "epoch": 82.44444444444444, "grad_norm": 1.4795564527051304, "kl": 0.05267333984375, "learning_rate": 8.141676086873573e-08, "loss": 0.0759, "reward": 3.28125, "reward_std": 1.9649099707603455, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9375, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 583.0625, "epoch": 82.66666666666667, "grad_norm": 1.6241400838987166, "kl": 0.0562744140625, "learning_rate": 7.917018574973644e-08, "loss": 0.0196, "reward": 4.40625, "reward_std": 2.2960872054100037, "rewards/accuracy_reward_staging": 0.53125, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.96875, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 553.5, "epoch": 82.88888888888889, "grad_norm": 1.5339258798115873, "kl": 0.0474853515625, "learning_rate": 7.695237378953224e-08, "loss": -0.0209, "reward": 4.5, "reward_std": 2.332531690597534, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 588.34375, "epoch": 83.22222222222223, "grad_norm": 1.5601324006274968, "kl": 0.05804443359375, "learning_rate": 7.476347657262455e-08, "loss": -0.039, "reward": 4.71875, "reward_std": 2.439529001712799, "rewards/accuracy_reward_staging": 0.5625, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 558.25, "epoch": 83.44444444444444, "grad_norm": 1.5625484764568953, "kl": 0.05841064453125, "learning_rate": 7.260364370723043e-08, "loss": -0.0022, "reward": 3.875, "reward_std": 2.6049662828445435, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 538.53125, "epoch": 83.66666666666667, "grad_norm": 1.6578933804792892, "kl": 0.064697265625, "learning_rate": 7.047302281505735e-08, "loss": 0.0178, "reward": 3.6875, "reward_std": 1.93262779712677, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 579.3125, "epoch": 83.88888888888889, "grad_norm": 1.737744709396854, "kl": 0.05303955078125, "learning_rate": 6.837175952121304e-08, "loss": -0.056, "reward": 3.875, "reward_std": 2.5176164507865906, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 559.25, "epoch": 84.22222222222223, "grad_norm": 1.5777065212148367, "kl": 0.0582275390625, "learning_rate": 6.629999744425235e-08, "loss": -0.0542, "reward": 3.3125, "reward_std": 1.8360159397125244, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.90625, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 604.03125, "epoch": 84.44444444444444, "grad_norm": 1.7153117337295096, "kl": 0.05419921875, "learning_rate": 6.42578781863613e-08, "loss": 0.0782, "reward": 3.625, "reward_std": 3.0492074489593506, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.90625, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 591.3125, "epoch": 84.66666666666667, "grad_norm": 1.3347688495066687, "kl": 0.053466796875, "learning_rate": 6.22455413236786e-08, "loss": -0.0014, "reward": 3.03125, "reward_std": 1.389709249138832, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.90625, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 633.6875, "epoch": 84.88888888888889, "grad_norm": 1.389754239090955, "kl": 0.04827880859375, "learning_rate": 6.026312439675551e-08, "loss": 0.0256, "reward": 4.25, "reward_std": 2.171033263206482, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 550.59375, "epoch": 85.22222222222223, "grad_norm": 1.7000896266286212, "kl": 0.06982421875, "learning_rate": 5.831076290115572e-08, "loss": 0.0243, "reward": 4.15625, "reward_std": 2.343973159790039, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.96875, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 600.8125, "epoch": 85.44444444444444, "grad_norm": 1.4833240972495387, "kl": 0.056884765625, "learning_rate": 5.638859027819409e-08, "loss": 0.0553, "reward": 3.53125, "reward_std": 2.29950013756752, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 650.875, "epoch": 85.66666666666667, "grad_norm": 1.3618290470634058, "kl": 0.04998779296875, "learning_rate": 5.44967379058161e-08, "loss": -0.0017, "reward": 5.0, "reward_std": 2.3323360979557037, "rewards/accuracy_reward_staging": 0.625, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9375, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 576.75, "epoch": 85.88888888888889, "grad_norm": 1.749186308713559, "kl": 0.05303955078125, "learning_rate": 5.263533508961826e-08, "loss": 0.0794, "reward": 3.34375, "reward_std": 2.082039564847946, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.9375, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 573.3125, "epoch": 86.22222222222223, "grad_norm": 1.4322968854479468, "kl": 0.05615234375, "learning_rate": 5.080450905401057e-08, "loss": 0.0153, "reward": 4.25, "reward_std": 1.8755539804697037, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 546.59375, "epoch": 86.44444444444444, "grad_norm": 1.2682803454826486, "kl": 0.053955078125, "learning_rate": 4.9004384933520547e-08, "loss": 0.0083, "reward": 3.53125, "reward_std": 1.3726893961429596, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 580.21875, "epoch": 86.66666666666667, "grad_norm": 1.5693179766123742, "kl": 0.05389404296875, "learning_rate": 4.723508576424062e-08, "loss": -0.0063, "reward": 3.46875, "reward_std": 2.777799040079117, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.90625, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 595.375, "epoch": 86.88888888888889, "grad_norm": 1.6314212035379032, "kl": 0.053955078125, "learning_rate": 4.549673247541874e-08, "loss": -0.01, "reward": 4.15625, "reward_std": 2.311874210834503, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 568.71875, "epoch": 87.22222222222223, "grad_norm": 1.658360075079596, "kl": 0.0572509765625, "learning_rate": 4.37894438811931e-08, "loss": 0.0064, "reward": 3.5625, "reward_std": 2.875, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 1.0, "step": 349 }, { "epoch": 87.44444444444444, "grad_norm": 1.4586421894209247, "learning_rate": 4.2113336672471245e-08, "loss": 0.0579, "step": 350 }, { "epoch": 87.44444444444444, "eval_clip_ratio": 0.0, "eval_completion_length": 609.05, "eval_kl": 0.052783203125, "eval_loss": 0.033656854182481766, "eval_reward": 2.45, "eval_reward_std": 1.6229771614074706, "eval_rewards/accuracy_reward_staging": 0.15, "eval_rewards/format_reward": 0.825, "eval_rewards/format_reward_staging": 0.875, "eval_runtime": 55.2193, "eval_samples_per_second": 0.652, "eval_steps_per_second": 0.091, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 609.578125, "epoch": 87.66666666666667, "grad_norm": 1.6263684105864271, "kl": 0.0521240234375, "learning_rate": 4.0468525408954456e-08, "loss": 0.0832, "reward": 4.265625, "reward_std": 2.632143199443817, "rewards/accuracy_reward_staging": 0.484375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.9375, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 630.0, "epoch": 87.88888888888889, "grad_norm": 1.406310532139818, "kl": 0.0509033203125, "learning_rate": 3.8855122511307626e-08, "loss": 0.0517, "reward": 3.0625, "reward_std": 1.3608438968658447, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 592.46875, "epoch": 88.22222222222223, "grad_norm": 1.505022624138408, "kl": 0.05255126953125, "learning_rate": 3.727323825347578e-08, "loss": 0.0469, "reward": 4.25, "reward_std": 2.023455113172531, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 570.21875, "epoch": 88.44444444444444, "grad_norm": 1.7398525080011868, "kl": 0.051025390625, "learning_rate": 3.572298075514652e-08, "loss": 0.0079, "reward": 5.25, "reward_std": 2.496154248714447, "rewards/accuracy_reward_staging": 0.65625, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 0.96875, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 609.75, "epoch": 88.66666666666667, "grad_norm": 1.4985931737207225, "kl": 0.05255126953125, "learning_rate": 3.420445597436056e-08, "loss": 0.0262, "reward": 4.09375, "reward_std": 2.1352776885032654, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 673.75, "epoch": 88.88888888888889, "grad_norm": 1.4906376909879282, "kl": 0.05889892578125, "learning_rate": 3.271776770026963e-08, "loss": 0.0716, "reward": 3.28125, "reward_std": 2.086387515068054, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.9375, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 549.53125, "epoch": 89.22222222222223, "grad_norm": 1.87310658244872, "kl": 0.08197021484375, "learning_rate": 3.1263017546042326e-08, "loss": 0.0395, "reward": 3.90625, "reward_std": 2.444858193397522, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 1.0, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 570.34375, "epoch": 89.44444444444444, "grad_norm": 1.6047992153607105, "kl": 0.05328369140625, "learning_rate": 2.9840304941919416e-08, "loss": 0.0128, "reward": 4.09375, "reward_std": 3.0098507404327393, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 588.09375, "epoch": 89.66666666666667, "grad_norm": 1.3977481210272724, "kl": 0.0628662109375, "learning_rate": 2.8449727128417367e-08, "loss": 0.0184, "reward": 3.6875, "reward_std": 1.197430670261383, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.875, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 663.0625, "epoch": 89.88888888888889, "grad_norm": 1.521856541607207, "kl": 0.04925537109375, "learning_rate": 2.7091379149682682e-08, "loss": -0.0485, "reward": 4.34375, "reward_std": 2.930923640727997, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 585.0625, "epoch": 90.22222222222223, "grad_norm": 1.8447806012116532, "kl": 0.05377197265625, "learning_rate": 2.5765353846995297e-08, "loss": 0.049, "reward": 4.53125, "reward_std": 3.1164740920066833, "rewards/accuracy_reward_staging": 0.5625, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.90625, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 650.21875, "epoch": 90.44444444444444, "grad_norm": 1.184344841195356, "kl": 0.04827880859375, "learning_rate": 2.4471741852423233e-08, "loss": 0.0432, "reward": 2.96875, "reward_std": 1.5483438968658447, "rewards/accuracy_reward_staging": 0.21875, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9375, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 547.875, "epoch": 90.66666666666667, "grad_norm": 1.4816241418571312, "kl": 0.065185546875, "learning_rate": 2.3210631582627927e-08, "loss": -0.007, "reward": 4.53125, "reward_std": 2.563981920480728, "rewards/accuracy_reward_staging": 0.53125, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 646.875, "epoch": 90.88888888888889, "grad_norm": 1.4324587884354845, "kl": 0.0557861328125, "learning_rate": 2.1982109232821176e-08, "loss": 0.0456, "reward": 4.6875, "reward_std": 2.595020294189453, "rewards/accuracy_reward_staging": 0.5625, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 609.875, "epoch": 91.22222222222223, "grad_norm": 1.406708270283001, "kl": 0.04571533203125, "learning_rate": 2.0786258770873645e-08, "loss": -0.0323, "reward": 5.28125, "reward_std": 2.358702301979065, "rewards/accuracy_reward_staging": 0.65625, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 625.96875, "epoch": 91.44444444444444, "grad_norm": 1.254136427080868, "kl": 0.0477294921875, "learning_rate": 1.9623161931575926e-08, "loss": 0.0391, "reward": 4.25, "reward_std": 1.3912444412708282, "rewards/accuracy_reward_staging": 0.46875, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 584.8125, "epoch": 91.66666666666667, "grad_norm": 1.4411232733747643, "kl": 0.057861328125, "learning_rate": 1.849289821105199e-08, "loss": 0.0171, "reward": 3.125, "reward_std": 1.5756275057792664, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9375, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 629.1875, "epoch": 91.88888888888889, "grad_norm": 1.1371389339839404, "kl": 0.051513671875, "learning_rate": 1.7395544861325718e-08, "loss": 0.011, "reward": 3.53125, "reward_std": 1.816932737827301, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 615.21875, "epoch": 92.22222222222223, "grad_norm": 1.2832104145352503, "kl": 0.046142578125, "learning_rate": 1.6331176885040876e-08, "loss": 0.0567, "reward": 3.78125, "reward_std": 1.9511407613754272, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 561.375, "epoch": 92.44444444444444, "grad_norm": 1.4397679570391773, "kl": 0.05340576171875, "learning_rate": 1.5299867030334813e-08, "loss": 0.0089, "reward": 3.1875, "reward_std": 1.2975594997406006, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 1.0, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 547.0625, "epoch": 92.66666666666667, "grad_norm": 1.5669540097130739, "kl": 0.066650390625, "learning_rate": 1.4301685785866213e-08, "loss": -0.0198, "reward": 4.46875, "reward_std": 2.9167675375938416, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 588.1875, "epoch": 92.88888888888889, "grad_norm": 1.6359800713030668, "kl": 0.05194091796875, "learning_rate": 1.3336701375997127e-08, "loss": 0.0226, "reward": 4.1875, "reward_std": 2.957531690597534, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 596.375, "epoch": 93.22222222222223, "grad_norm": 22.007037588121534, "kl": 0.2752685546875, "learning_rate": 1.240497975613014e-08, "loss": -0.0325, "reward": 3.75, "reward_std": 1.8229495882987976, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.8125, "rewards/format_reward_staging": 0.90625, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 554.75, "epoch": 93.44444444444444, "grad_norm": 1.5726784994170007, "kl": 0.05316162109375, "learning_rate": 1.1506584608200364e-08, "loss": 0.0904, "reward": 2.75, "reward_std": 1.680722177028656, "rewards/accuracy_reward_staging": 0.1875, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.90625, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 597.53125, "epoch": 93.66666666666667, "grad_norm": 1.2719220301062524, "kl": 0.05828857421875, "learning_rate": 1.0641577336322761e-08, "loss": 0.0199, "reward": 4.96875, "reward_std": 2.2166852056980133, "rewards/accuracy_reward_staging": 0.625, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.96875, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 640.1875, "epoch": 93.88888888888889, "grad_norm": 2.653449397011027, "kl": 0.07366943359375, "learning_rate": 9.810017062595321e-09, "loss": 0.0336, "reward": 4.0, "reward_std": 2.329674154520035, "rewards/accuracy_reward_staging": 0.4375, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 511.21875, "epoch": 94.22222222222223, "grad_norm": 1.4821827826071337, "kl": 0.04779052734375, "learning_rate": 9.011960623058201e-09, "loss": -0.0241, "reward": 4.53125, "reward_std": 1.9632892608642578, "rewards/accuracy_reward_staging": 0.53125, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 516.34375, "epoch": 94.44444444444444, "grad_norm": 1.424328758939937, "kl": 0.055419921875, "learning_rate": 8.247462563808816e-09, "loss": 0.018, "reward": 4.46875, "reward_std": 2.4695461988449097, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 630.75, "epoch": 94.66666666666667, "grad_norm": 1.3627094949939382, "kl": 0.05291748046875, "learning_rate": 7.516575137274162e-09, "loss": 0.05, "reward": 3.9375, "reward_std": 2.0698782801628113, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 632.625, "epoch": 94.88888888888889, "grad_norm": 1.1699961416103346, "kl": 0.05120849609375, "learning_rate": 6.819348298638839e-09, "loss": 0.0182, "reward": 3.1875, "reward_std": 2.152123808860779, "rewards/accuracy_reward_staging": 0.25, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 618.8125, "epoch": 95.22222222222223, "grad_norm": 1.362463533881549, "kl": 0.06512451171875, "learning_rate": 6.15582970243117e-09, "loss": 0.0677, "reward": 3.375, "reward_std": 1.7910222113132477, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 551.15625, "epoch": 95.44444444444444, "grad_norm": 1.297999563247604, "kl": 0.06048583984375, "learning_rate": 5.526064699265753e-09, "loss": 0.0032, "reward": 3.96875, "reward_std": 1.8312554359436035, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 604.625, "epoch": 95.66666666666667, "grad_norm": 1.486555198053525, "kl": 0.05755615234375, "learning_rate": 4.9300963327441044e-09, "loss": 0.043, "reward": 4.40625, "reward_std": 3.2432121634483337, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 583.875, "epoch": 95.88888888888889, "grad_norm": 1.5255994079961044, "kl": 0.05120849609375, "learning_rate": 4.367965336512403e-09, "loss": -0.0079, "reward": 3.8125, "reward_std": 1.9035333096981049, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 1.0, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 500.53125, "epoch": 96.22222222222223, "grad_norm": 1.3113670847804533, "kl": 0.05438232421875, "learning_rate": 3.8397101314774915e-09, "loss": -0.0184, "reward": 3.28125, "reward_std": 1.0818375647068024, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9375, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 554.8125, "epoch": 96.44444444444444, "grad_norm": 1.7546175198232294, "kl": 0.056640625, "learning_rate": 3.3453668231809283e-09, "loss": -0.0321, "reward": 5.5, "reward_std": 3.5806562304496765, "rewards/accuracy_reward_staging": 0.71875, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.96875, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 632.9375, "epoch": 96.66666666666667, "grad_norm": 1.3830980489663667, "kl": 0.050537109375, "learning_rate": 2.8849691993311777e-09, "loss": 0.0483, "reward": 3.59375, "reward_std": 2.2675071954727173, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.96875, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 565.0, "epoch": 96.88888888888889, "grad_norm": 1.6187360157512092, "kl": 0.0645751953125, "learning_rate": 2.458548727494292e-09, "loss": 0.0672, "reward": 4.1875, "reward_std": 2.5254639387130737, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.78125, "rewards/format_reward_staging": 0.90625, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 601.3125, "epoch": 97.22222222222223, "grad_norm": 1.513261818035226, "kl": 0.05316162109375, "learning_rate": 2.066134552943077e-09, "loss": -0.054, "reward": 4.0, "reward_std": 2.443375587463379, "rewards/accuracy_reward_staging": 0.40625, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 1.0, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 618.53125, "epoch": 97.44444444444444, "grad_norm": 1.2736528121828654, "kl": 0.04791259765625, "learning_rate": 1.7077534966650765e-09, "loss": 0.0219, "reward": 5.375, "reward_std": 2.514360010623932, "rewards/accuracy_reward_staging": 0.6875, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 1.0, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 593.1875, "epoch": 97.66666666666667, "grad_norm": 1.5869628592439236, "kl": 0.0782470703125, "learning_rate": 1.383430053529422e-09, "loss": -0.01, "reward": 3.40625, "reward_std": 1.3685379922389984, "rewards/accuracy_reward_staging": 0.3125, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 0.9375, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 646.5, "epoch": 97.88888888888889, "grad_norm": 1.3206401873227125, "kl": 0.052490234375, "learning_rate": 1.0931863906127325e-09, "loss": -0.0253, "reward": 3.65625, "reward_std": 1.6694981455802917, "rewards/accuracy_reward_staging": 0.34375, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 573.78125, "epoch": 98.22222222222223, "grad_norm": 1.8394528163274233, "kl": 0.0552978515625, "learning_rate": 8.370423456837139e-10, "loss": 0.0136, "reward": 4.625, "reward_std": 2.260310411453247, "rewards/accuracy_reward_staging": 0.5625, "rewards/format_reward": 0.84375, "rewards/format_reward_staging": 0.96875, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 621.53125, "epoch": 98.44444444444444, "grad_norm": 1.5558051348782826, "kl": 0.06475830078125, "learning_rate": 6.150154258476314e-10, "loss": -0.0687, "reward": 4.71875, "reward_std": 2.5687596797943115, "rewards/accuracy_reward_staging": 0.5625, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.9375, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 595.75, "epoch": 98.66666666666667, "grad_norm": 1.1453949481051802, "kl": 0.048095703125, "learning_rate": 4.271208063494902e-10, "loss": -0.0004, "reward": 2.5625, "reward_std": 1.1108438968658447, "rewards/accuracy_reward_staging": 0.125, "rewards/format_reward": 0.96875, "rewards/format_reward_staging": 0.96875, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 612.9375, "epoch": 98.88888888888889, "grad_norm": 1.627509813072313, "kl": 0.05230712890625, "learning_rate": 2.733713295369755e-10, "loss": -0.0208, "reward": 4.375, "reward_std": 2.401917338371277, "rewards/accuracy_reward_staging": 0.5, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9375, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 598.03125, "epoch": 99.22222222222223, "grad_norm": 1.3542702485509543, "kl": 0.0565185546875, "learning_rate": 1.53777503982655e-10, "loss": 0.0102, "reward": 3.3125, "reward_std": 2.002065122127533, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.90625, "rewards/format_reward_staging": 1.0, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 589.84375, "epoch": 99.44444444444444, "grad_norm": 1.524546929028841, "kl": 0.06207275390625, "learning_rate": 6.834750376549791e-11, "loss": 0.0366, "reward": 4.625, "reward_std": 2.895161896944046, "rewards/accuracy_reward_staging": 0.5625, "rewards/format_reward": 0.875, "rewards/format_reward_staging": 0.9375, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 563.6875, "epoch": 99.66666666666667, "grad_norm": 1.4232250672089828, "kl": 0.05712890625, "learning_rate": 1.7087167912710476e-11, "loss": 0.0203, "reward": 3.875, "reward_std": 1.875, "rewards/accuracy_reward_staging": 0.375, "rewards/format_reward": 1.0, "rewards/format_reward_staging": 1.0, "step": 399 }, { "epoch": 99.88888888888889, "grad_norm": 1.8247807880781484, "learning_rate": 0.0, "loss": 0.0974, "step": 400 }, { "epoch": 99.88888888888889, "eval_clip_ratio": 0.0, "eval_completion_length": 644.175, "eval_kl": 0.052978515625, "eval_loss": 0.008646870031952858, "eval_reward": 2.3, "eval_reward_std": 1.2995877504348754, "eval_rewards/accuracy_reward_staging": 0.125, "eval_rewards/format_reward": 0.8, "eval_rewards/format_reward_staging": 0.875, "eval_runtime": 54.9436, "eval_samples_per_second": 0.655, "eval_steps_per_second": 0.091, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 498.375, "epoch": 99.88888888888889, "kl": 0.0625, "reward": 3.28125, "reward_std": 2.418270230293274, "rewards/accuracy_reward_staging": 0.28125, "rewards/format_reward": 0.9375, "rewards/format_reward_staging": 0.9375, "step": 400, "total_flos": 0.0, "train_loss": 0.01939338302021497, "train_runtime": 14839.7642, "train_samples_per_second": 0.243, "train_steps_per_second": 0.027 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }