Qwen2.5-3B-Instruct / trainer_state.json
cameronphchen's picture
Model save
d7d50ed verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 99.88888888888889,
"eval_steps": 50,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 535.125,
"epoch": 0.2222222222222222,
"grad_norm": 1.7916724271880604,
"kl": 0.0,
"learning_rate": 5e-08,
"loss": 0.0583,
"reward": 2.3125,
"reward_std": 1.1971687823534012,
"rewards/accuracy_reward_staging": 0.09375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 555.90625,
"epoch": 0.4444444444444444,
"grad_norm": 1.5555075403521712,
"kl": 0.0,
"learning_rate": 1e-07,
"loss": -0.0705,
"reward": 2.5625,
"reward_std": 1.2858919501304626,
"rewards/accuracy_reward_staging": 0.15625,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.9375,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 541.46875,
"epoch": 0.6666666666666666,
"grad_norm": 1.6594522931688669,
"kl": 0.0010576248168945312,
"learning_rate": 1.5e-07,
"loss": -0.0235,
"reward": 2.59375,
"reward_std": 1.6232599020004272,
"rewards/accuracy_reward_staging": 0.15625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 613.25,
"epoch": 0.8888888888888888,
"grad_norm": 2.3276142189283164,
"kl": 0.0011081695556640625,
"learning_rate": 2e-07,
"loss": 0.1029,
"reward": 2.875,
"reward_std": 1.8071783781051636,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.96875,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 649.84375,
"epoch": 1.2222222222222223,
"grad_norm": 1.5167959821278052,
"kl": 0.0010709762573242188,
"learning_rate": 2.5e-07,
"loss": 0.0003,
"reward": 2.84375,
"reward_std": 1.7606024742126465,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.96875,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 574.25,
"epoch": 1.4444444444444444,
"grad_norm": 1.491122536644779,
"kl": 0.0009145736694335938,
"learning_rate": 3e-07,
"loss": 0.0377,
"reward": 2.75,
"reward_std": 1.8017165958881378,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 550.46875,
"epoch": 1.6666666666666665,
"grad_norm": 1.5321454699600687,
"kl": 0.0016422271728515625,
"learning_rate": 3.5e-07,
"loss": 0.0173,
"reward": 2.8125,
"reward_std": 1.498587191104889,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 530.0625,
"epoch": 1.8888888888888888,
"grad_norm": 1.7429693147530465,
"kl": 0.0010614395141601562,
"learning_rate": 4e-07,
"loss": 0.0413,
"reward": 3.15625,
"reward_std": 2.1272581219673157,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.96875,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 643.875,
"epoch": 2.2222222222222223,
"grad_norm": 1.53726074310182,
"kl": 0.0013751983642578125,
"learning_rate": 4.5e-07,
"loss": -0.005,
"reward": 3.125,
"reward_std": 2.054091453552246,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 614.09375,
"epoch": 2.4444444444444446,
"grad_norm": 1.3654100960829842,
"kl": 0.0012149810791015625,
"learning_rate": 5e-07,
"loss": -0.0164,
"reward": 2.59375,
"reward_std": 1.0483438968658447,
"rewards/accuracy_reward_staging": 0.125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 555.3125,
"epoch": 2.6666666666666665,
"grad_norm": 1.4260001116361793,
"kl": 0.0010051727294921875,
"learning_rate": 5.5e-07,
"loss": 0.0251,
"reward": 3.0625,
"reward_std": 1.7733518332242966,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.96875,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 552.5625,
"epoch": 2.888888888888889,
"grad_norm": 1.5253120629648043,
"kl": 0.001361846923828125,
"learning_rate": 6e-07,
"loss": 0.0285,
"reward": 3.3125,
"reward_std": 1.9136751294136047,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 1.0,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 579.71875,
"epoch": 3.2222222222222223,
"grad_norm": 1.5612924435198745,
"kl": 0.0019207000732421875,
"learning_rate": 6.5e-07,
"loss": 0.0829,
"reward": 2.0625,
"reward_std": 0.5475594997406006,
"rewards/accuracy_reward_staging": 0.0625,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.9375,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 648.125,
"epoch": 3.4444444444444446,
"grad_norm": 1.472369166378751,
"kl": 0.0019435882568359375,
"learning_rate": 7e-07,
"loss": 0.0889,
"reward": 2.3125,
"reward_std": 1.3669461011886597,
"rewards/accuracy_reward_staging": 0.125,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.84375,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 660.78125,
"epoch": 3.6666666666666665,
"grad_norm": 1.2833764786982476,
"kl": 0.00171661376953125,
"learning_rate": 7.5e-07,
"loss": -0.0032,
"reward": 2.28125,
"reward_std": 0.9946783781051636,
"rewards/accuracy_reward_staging": 0.09375,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.96875,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 555.625,
"epoch": 3.888888888888889,
"grad_norm": 1.7981216304584955,
"kl": 0.003185272216796875,
"learning_rate": 8e-07,
"loss": 0.0022,
"reward": 4.09375,
"reward_std": 2.7086294293403625,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 581.375,
"epoch": 4.222222222222222,
"grad_norm": 1.8924801483136653,
"kl": 0.003849029541015625,
"learning_rate": 8.499999999999999e-07,
"loss": 0.0192,
"reward": 2.8125,
"reward_std": 1.4357599020004272,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 626.875,
"epoch": 4.444444444444445,
"grad_norm": 1.4237753323985947,
"kl": 0.004940032958984375,
"learning_rate": 9e-07,
"loss": 0.0048,
"reward": 2.78125,
"reward_std": 1.6875,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 583.875,
"epoch": 4.666666666666667,
"grad_norm": 1.4401282377616447,
"kl": 0.00505828857421875,
"learning_rate": 9.499999999999999e-07,
"loss": 0.0016,
"reward": 3.4375,
"reward_std": 2.3147872537374496,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.90625,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 562.8125,
"epoch": 4.888888888888889,
"grad_norm": 1.1629869227175655,
"kl": 0.00585174560546875,
"learning_rate": 1e-06,
"loss": 0.0,
"reward": 2.5625,
"reward_std": 0.9797460436820984,
"rewards/accuracy_reward_staging": 0.125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 614.875,
"epoch": 5.222222222222222,
"grad_norm": 1.6115188653051613,
"kl": 0.00612640380859375,
"learning_rate": 9.999829128320873e-07,
"loss": 0.0565,
"reward": 3.28125,
"reward_std": 2.4976893961429596,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 559.46875,
"epoch": 5.444444444444445,
"grad_norm": 1.465512353981508,
"kl": 0.00824737548828125,
"learning_rate": 9.999316524962345e-07,
"loss": 0.0541,
"reward": 3.3125,
"reward_std": 1.8101893961429596,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 572.59375,
"epoch": 5.666666666666667,
"grad_norm": 1.5847579776558225,
"kl": 0.0093841552734375,
"learning_rate": 9.998462224960173e-07,
"loss": 0.06,
"reward": 3.6875,
"reward_std": 2.443375587463379,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 645.9375,
"epoch": 5.888888888888889,
"grad_norm": 1.8362203993654154,
"kl": 0.00734710693359375,
"learning_rate": 9.99726628670463e-07,
"loss": 0.0368,
"reward": 3.03125,
"reward_std": 2.283504918217659,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.90625,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 572.5,
"epoch": 6.222222222222222,
"grad_norm": 1.6415108932304052,
"kl": 0.0096588134765625,
"learning_rate": 9.995728791936505e-07,
"loss": 0.0267,
"reward": 2.96875,
"reward_std": 1.7760016024112701,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 579.96875,
"epoch": 6.444444444444445,
"grad_norm": 1.4689069714869325,
"kl": 0.010345458984375,
"learning_rate": 9.993849845741523e-07,
"loss": 0.1034,
"reward": 2.5625,
"reward_std": 1.1108438968658447,
"rewards/accuracy_reward_staging": 0.15625,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.90625,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 542.46875,
"epoch": 6.666666666666667,
"grad_norm": 1.7253968854719324,
"kl": 0.01122283935546875,
"learning_rate": 9.991629576543163e-07,
"loss": -0.0129,
"reward": 2.625,
"reward_std": 1.316565990447998,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.90625,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 598.0,
"epoch": 6.888888888888889,
"grad_norm": 1.439672104037944,
"kl": 0.0132293701171875,
"learning_rate": 9.989068136093872e-07,
"loss": 0.0324,
"reward": 3.375,
"reward_std": 2.423195868730545,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 557.5625,
"epoch": 7.222222222222222,
"grad_norm": 1.53093980357088,
"kl": 0.0146942138671875,
"learning_rate": 9.986165699464705e-07,
"loss": -0.0074,
"reward": 3.125,
"reward_std": 2.0308370888233185,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 574.90625,
"epoch": 7.444444444444445,
"grad_norm": 1.0715134693817079,
"kl": 0.0147857666015625,
"learning_rate": 9.982922465033348e-07,
"loss": -0.0166,
"reward": 2.5,
"reward_std": 0.9858438968658447,
"rewards/accuracy_reward_staging": 0.125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 1.0,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 602.65625,
"epoch": 7.666666666666667,
"grad_norm": 1.4389686833903352,
"kl": 0.01611328125,
"learning_rate": 9.979338654470567e-07,
"loss": 0.0875,
"reward": 2.4375,
"reward_std": 1.2930222898721695,
"rewards/accuracy_reward_staging": 0.125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 669.5625,
"epoch": 7.888888888888889,
"grad_norm": 1.0489321524468773,
"kl": 0.01910400390625,
"learning_rate": 9.975414512725056e-07,
"loss": 0.0185,
"reward": 2.5625,
"reward_std": 1.037847101688385,
"rewards/accuracy_reward_staging": 0.125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 532.625,
"epoch": 8.222222222222221,
"grad_norm": 1.5157291140048736,
"kl": 0.01885986328125,
"learning_rate": 9.971150308006687e-07,
"loss": -0.0001,
"reward": 4.125,
"reward_std": 2.000675529241562,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 591.03125,
"epoch": 8.444444444444445,
"grad_norm": 1.5963578785319679,
"kl": 0.0192413330078125,
"learning_rate": 9.966546331768192e-07,
"loss": 0.1269,
"reward": 2.875,
"reward_std": 2.112294152379036,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.84375,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 603.09375,
"epoch": 8.666666666666666,
"grad_norm": 1.4508455813252856,
"kl": 0.01494598388671875,
"learning_rate": 9.961602898685223e-07,
"loss": 0.0585,
"reward": 3.3125,
"reward_std": 2.0126227736473083,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.9375,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 579.9375,
"epoch": 8.88888888888889,
"grad_norm": 1.196537394176258,
"kl": 0.0169830322265625,
"learning_rate": 9.956320346634875e-07,
"loss": 0.0166,
"reward": 2.78125,
"reward_std": 1.3710740953683853,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 1.0,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 572.96875,
"epoch": 9.222222222222221,
"grad_norm": 1.4031846103728705,
"kl": 0.0164794921875,
"learning_rate": 9.95069903667256e-07,
"loss": 0.0257,
"reward": 2.65625,
"reward_std": 1.4369846880435944,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.90625,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 533.8125,
"epoch": 9.444444444444445,
"grad_norm": 1.7378697171564481,
"kl": 0.019744873046875,
"learning_rate": 9.944739353007341e-07,
"loss": 0.0651,
"reward": 3.6875,
"reward_std": 2.841255784034729,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.96875,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 575.5625,
"epoch": 9.666666666666666,
"grad_norm": 1.6742496883549038,
"kl": 0.018218994140625,
"learning_rate": 9.938441702975689e-07,
"loss": 0.0249,
"reward": 2.4375,
"reward_std": 1.2126952707767487,
"rewards/accuracy_reward_staging": 0.125,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.96875,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 541.375,
"epoch": 9.88888888888889,
"grad_norm": 1.6853780037379804,
"kl": 0.0196533203125,
"learning_rate": 9.931806517013612e-07,
"loss": 0.0121,
"reward": 2.5625,
"reward_std": 1.3815238624811172,
"rewards/accuracy_reward_staging": 0.15625,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.9375,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 550.90625,
"epoch": 10.222222222222221,
"grad_norm": 1.2047759950332129,
"kl": 0.017730712890625,
"learning_rate": 9.924834248627258e-07,
"loss": 0.0398,
"reward": 2.8125,
"reward_std": 1.4487498700618744,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 587.34375,
"epoch": 10.444444444444445,
"grad_norm": 2.2662890327219642,
"kl": 0.032135009765625,
"learning_rate": 9.917525374361911e-07,
"loss": 0.0402,
"reward": 3.375,
"reward_std": 2.6460810601711273,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.96875,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 552.4375,
"epoch": 10.666666666666666,
"grad_norm": 0.8485843884389722,
"kl": 0.021148681640625,
"learning_rate": 9.909880393769418e-07,
"loss": 0.0349,
"reward": 2.5,
"reward_std": 1.045437604188919,
"rewards/accuracy_reward_staging": 0.125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 626.625,
"epoch": 10.88888888888889,
"grad_norm": 1.4242611362455049,
"kl": 0.018280029296875,
"learning_rate": 9.901899829374047e-07,
"loss": 0.0405,
"reward": 3.03125,
"reward_std": 2.107846677303314,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.90625,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 536.8125,
"epoch": 11.222222222222221,
"grad_norm": 1.6000184113652984,
"kl": 0.025238037109375,
"learning_rate": 9.893584226636772e-07,
"loss": -0.0471,
"reward": 2.78125,
"reward_std": 1.6772827804088593,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 1.0,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 588.65625,
"epoch": 11.444444444444445,
"grad_norm": 1.2633801476740014,
"kl": 0.02093505859375,
"learning_rate": 9.884934153917996e-07,
"loss": 0.027,
"reward": 2.4375,
"reward_std": 1.226884126663208,
"rewards/accuracy_reward_staging": 0.125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 524.1875,
"epoch": 11.666666666666666,
"grad_norm": 1.7228504370636915,
"kl": 0.020599365234375,
"learning_rate": 9.8759502024387e-07,
"loss": -0.0016,
"reward": 3.125,
"reward_std": 1.8041669130325317,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.90625,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 572.875,
"epoch": 11.88888888888889,
"grad_norm": 7.859881611636793,
"kl": 0.063079833984375,
"learning_rate": 9.866632986240029e-07,
"loss": 0.0482,
"reward": 3.25,
"reward_std": 2.0755133628845215,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 577.5625,
"epoch": 12.222222222222221,
"grad_norm": 1.7851397304147796,
"kl": 0.0205078125,
"learning_rate": 9.856983142141337e-07,
"loss": 0.0509,
"reward": 3.3125,
"reward_std": 2.14286145567894,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.9375,
"step": 49
},
{
"epoch": 12.444444444444445,
"grad_norm": 1.585137209096838,
"learning_rate": 9.847001329696652e-07,
"loss": -0.0125,
"step": 50
},
{
"epoch": 12.444444444444445,
"eval_clip_ratio": 0.0,
"eval_completion_length": 597.925,
"eval_kl": 0.02578125,
"eval_loss": 0.024221811443567276,
"eval_reward": 2.625,
"eval_reward_std": 1.6041045665740967,
"eval_rewards/accuracy_reward_staging": 0.175,
"eval_rewards/format_reward": 0.8,
"eval_rewards/format_reward_staging": 0.95,
"eval_runtime": 51.776,
"eval_samples_per_second": 0.695,
"eval_steps_per_second": 0.097,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 567.046875,
"epoch": 12.666666666666666,
"grad_norm": 1.6497404550782266,
"kl": 0.020294189453125,
"learning_rate": 9.836688231149592e-07,
"loss": -0.0235,
"reward": 3.328125,
"reward_std": 2.148952841758728,
"rewards/accuracy_reward_staging": 0.296875,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.953125,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 655.3125,
"epoch": 12.88888888888889,
"grad_norm": 1.0110588489868237,
"kl": 0.018829345703125,
"learning_rate": 9.826044551386742e-07,
"loss": -0.0207,
"reward": 2.5625,
"reward_std": 1.046603798866272,
"rewards/accuracy_reward_staging": 0.125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 571.65625,
"epoch": 13.222222222222221,
"grad_norm": 1.5942717910970237,
"kl": 0.0233154296875,
"learning_rate": 9.81507101788948e-07,
"loss": 0.0327,
"reward": 2.96875,
"reward_std": 2.0815286338329315,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 1.0,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 531.0,
"epoch": 13.444444444444445,
"grad_norm": 1.6431487531106521,
"kl": 0.02325439453125,
"learning_rate": 9.803768380684242e-07,
"loss": -0.005,
"reward": 3.1875,
"reward_std": 2.4305797815322876,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.90625,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 530.71875,
"epoch": 13.666666666666666,
"grad_norm": 1.3532727186337274,
"kl": 0.021026611328125,
"learning_rate": 9.792137412291263e-07,
"loss": -0.0091,
"reward": 3.09375,
"reward_std": 1.5625,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 573.375,
"epoch": 13.88888888888889,
"grad_norm": 1.4441812945667367,
"kl": 0.024932861328125,
"learning_rate": 9.780178907671788e-07,
"loss": 0.0275,
"reward": 3.34375,
"reward_std": 2.1209341287612915,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.9375,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 520.15625,
"epoch": 14.222222222222221,
"grad_norm": 1.6824005469371979,
"kl": 0.026092529296875,
"learning_rate": 9.76789368417372e-07,
"loss": -0.0531,
"reward": 2.8125,
"reward_std": 1.377088338136673,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 1.0,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 599.96875,
"epoch": 14.444444444444445,
"grad_norm": 1.4915574785365073,
"kl": 0.021026611328125,
"learning_rate": 9.755282581475767e-07,
"loss": 0.0364,
"reward": 4.9375,
"reward_std": 2.745547831058502,
"rewards/accuracy_reward_staging": 0.59375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 549.71875,
"epoch": 14.666666666666666,
"grad_norm": 1.4821961551515155,
"kl": 0.02593994140625,
"learning_rate": 9.742346461530047e-07,
"loss": 0.0872,
"reward": 2.53125,
"reward_std": 1.4375,
"rewards/accuracy_reward_staging": 0.125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 601.375,
"epoch": 14.88888888888889,
"grad_norm": 1.249530356824017,
"kl": 0.023406982421875,
"learning_rate": 9.729086208503173e-07,
"loss": 0.0652,
"reward": 2.4375,
"reward_std": 1.1680222749710083,
"rewards/accuracy_reward_staging": 0.125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 613.71875,
"epoch": 15.222222222222221,
"grad_norm": 1.4621397072761817,
"kl": 0.0252685546875,
"learning_rate": 9.715502728715825e-07,
"loss": 0.0108,
"reward": 2.96875,
"reward_std": 1.8319481909275055,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 621.71875,
"epoch": 15.444444444444445,
"grad_norm": 1.4960047973343167,
"kl": 0.023590087890625,
"learning_rate": 9.701596950580807e-07,
"loss": -0.008,
"reward": 3.21875,
"reward_std": 2.3255662322044373,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 559.0,
"epoch": 15.666666666666666,
"grad_norm": 1.377229747116843,
"kl": 0.031097412109375,
"learning_rate": 9.687369824539576e-07,
"loss": 0.072,
"reward": 2.9375,
"reward_std": 1.7239685356616974,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 542.125,
"epoch": 15.88888888888889,
"grad_norm": 1.3837348765591453,
"kl": 0.034423828125,
"learning_rate": 9.672822322997304e-07,
"loss": 0.0508,
"reward": 2.28125,
"reward_std": 1.1752630770206451,
"rewards/accuracy_reward_staging": 0.09375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 645.625,
"epoch": 16.22222222222222,
"grad_norm": 1.2294440183285422,
"kl": 0.023651123046875,
"learning_rate": 9.657955440256395e-07,
"loss": -0.0012,
"reward": 2.59375,
"reward_std": 1.0483438968658447,
"rewards/accuracy_reward_staging": 0.125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 593.46875,
"epoch": 16.444444444444443,
"grad_norm": 1.5888259552277046,
"kl": 0.02777099609375,
"learning_rate": 9.642770192448535e-07,
"loss": 0.0496,
"reward": 3.71875,
"reward_std": 2.2672154307365417,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.96875,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 574.875,
"epoch": 16.666666666666668,
"grad_norm": 1.5785381612535059,
"kl": 0.034942626953125,
"learning_rate": 9.627267617465243e-07,
"loss": -0.0426,
"reward": 3.03125,
"reward_std": 1.496883064508438,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 566.4375,
"epoch": 16.88888888888889,
"grad_norm": 1.5972037559178247,
"kl": 0.026702880859375,
"learning_rate": 9.611448774886923e-07,
"loss": 0.005,
"reward": 3.15625,
"reward_std": 1.9091877937316895,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 624.21875,
"epoch": 17.22222222222222,
"grad_norm": 2.781118760006495,
"kl": 0.042724609375,
"learning_rate": 9.595314745910455e-07,
"loss": 0.0926,
"reward": 3.3125,
"reward_std": 2.4584514498710632,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.9375,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 545.65625,
"epoch": 17.444444444444443,
"grad_norm": 1.6678524207695304,
"kl": 0.028411865234375,
"learning_rate": 9.578866633275286e-07,
"loss": 0.0606,
"reward": 3.75,
"reward_std": 2.237764596939087,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 524.0,
"epoch": 17.666666666666668,
"grad_norm": 1.4163449995088322,
"kl": 0.032470703125,
"learning_rate": 9.562105561188068e-07,
"loss": 0.0105,
"reward": 3.40625,
"reward_std": 1.9233438968658447,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 619.8125,
"epoch": 17.88888888888889,
"grad_norm": 1.330864720677356,
"kl": 0.02783203125,
"learning_rate": 9.545032675245813e-07,
"loss": 0.0232,
"reward": 2.875,
"reward_std": 1.5208123177289963,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.90625,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 615.5625,
"epoch": 18.22222222222222,
"grad_norm": 1.3940735985441313,
"kl": 0.0289306640625,
"learning_rate": 9.527649142357594e-07,
"loss": 0.0449,
"reward": 4.8125,
"reward_std": 3.365248918533325,
"rewards/accuracy_reward_staging": 0.59375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 560.6875,
"epoch": 18.444444444444443,
"grad_norm": 1.573293312721447,
"kl": 0.031890869140625,
"learning_rate": 9.509956150664795e-07,
"loss": 0.0727,
"reward": 2.40625,
"reward_std": 1.0983919501304626,
"rewards/accuracy_reward_staging": 0.125,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 1.0,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 597.71875,
"epoch": 18.666666666666668,
"grad_norm": 1.3288952801951834,
"kl": 0.028411865234375,
"learning_rate": 9.491954909459894e-07,
"loss": 0.0299,
"reward": 4.125,
"reward_std": 2.0565126538276672,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 566.875,
"epoch": 18.88888888888889,
"grad_norm": 1.600349042443185,
"kl": 0.03497314453125,
"learning_rate": 9.473646649103817e-07,
"loss": 0.0048,
"reward": 3.46875,
"reward_std": 2.3291621804237366,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 527.15625,
"epoch": 19.22222222222222,
"grad_norm": 2.0117307258354242,
"kl": 0.034820556640625,
"learning_rate": 9.455032620941839e-07,
"loss": 0.0076,
"reward": 3.15625,
"reward_std": 1.690910965204239,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 548.125,
"epoch": 19.444444444444443,
"grad_norm": 1.2154400614249532,
"kl": 0.034393310546875,
"learning_rate": 9.436114097218058e-07,
"loss": 0.0153,
"reward": 2.34375,
"reward_std": 0.9375,
"rewards/accuracy_reward_staging": 0.09375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 599.9375,
"epoch": 19.666666666666668,
"grad_norm": 1.6406138056170174,
"kl": 0.029205322265625,
"learning_rate": 9.416892370988442e-07,
"loss": 0.0752,
"reward": 2.75,
"reward_std": 1.9128470420837402,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 553.6875,
"epoch": 19.88888888888889,
"grad_norm": 1.527942838909739,
"kl": 0.030364990234375,
"learning_rate": 9.397368756032444e-07,
"loss": -0.0126,
"reward": 4.34375,
"reward_std": 3.079783648252487,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 561.53125,
"epoch": 20.22222222222222,
"grad_norm": 1.6342025185675375,
"kl": 0.032318115234375,
"learning_rate": 9.377544586763214e-07,
"loss": -0.0331,
"reward": 4.0625,
"reward_std": 2.1620407104492188,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 608.125,
"epoch": 20.444444444444443,
"grad_norm": 0.9760974984694354,
"kl": 0.03082275390625,
"learning_rate": 9.357421218136386e-07,
"loss": -0.0281,
"reward": 2.90625,
"reward_std": 1.3726893961429596,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 607.1875,
"epoch": 20.666666666666668,
"grad_norm": 2.9650567483991894,
"kl": 0.0552978515625,
"learning_rate": 9.337000025557476e-07,
"loss": 0.0494,
"reward": 2.78125,
"reward_std": 1.907078742980957,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.875,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 648.5625,
"epoch": 20.88888888888889,
"grad_norm": 1.4868637308996282,
"kl": 0.04937744140625,
"learning_rate": 9.316282404787869e-07,
"loss": 0.0813,
"reward": 2.4375,
"reward_std": 1.534547746181488,
"rewards/accuracy_reward_staging": 0.15625,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.875,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 598.0,
"epoch": 21.22222222222222,
"grad_norm": 1.5354307812124717,
"kl": 0.03326416015625,
"learning_rate": 9.295269771849425e-07,
"loss": 0.1102,
"reward": 3.53125,
"reward_std": 2.2993226647377014,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 585.25,
"epoch": 21.444444444444443,
"grad_norm": 1.21306534102283,
"kl": 0.03662109375,
"learning_rate": 9.273963562927694e-07,
"loss": 0.0034,
"reward": 2.90625,
"reward_std": 1.0483438968658447,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 568.71875,
"epoch": 21.666666666666668,
"grad_norm": 11.152096799903676,
"kl": 0.09722900390625,
"learning_rate": 9.252365234273753e-07,
"loss": 0.0125,
"reward": 3.1875,
"reward_std": 1.9283326417207718,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.96875,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 567.0625,
"epoch": 21.88888888888889,
"grad_norm": 1.4820021533223564,
"kl": 0.04046630859375,
"learning_rate": 9.230476262104676e-07,
"loss": 0.0631,
"reward": 3.40625,
"reward_std": 2.233847141265869,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 1.0,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 571.625,
"epoch": 22.22222222222222,
"grad_norm": 1.6685149630374954,
"kl": 0.04840087890625,
"learning_rate": 9.208298142502635e-07,
"loss": 0.057,
"reward": 2.90625,
"reward_std": 1.7658206820487976,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.96875,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 652.0,
"epoch": 22.444444444444443,
"grad_norm": 1.2598669750057847,
"kl": 0.037872314453125,
"learning_rate": 9.185832391312642e-07,
"loss": 0.0397,
"reward": 2.1875,
"reward_std": 1.0936830341815948,
"rewards/accuracy_reward_staging": 0.09375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.84375,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 512.84375,
"epoch": 22.666666666666668,
"grad_norm": 1.464116156191612,
"kl": 0.0408935546875,
"learning_rate": 9.163080544038952e-07,
"loss": 0.0325,
"reward": 3.1875,
"reward_std": 2.0054054260253906,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.9375,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 595.0,
"epoch": 22.88888888888889,
"grad_norm": 1.7502771652549964,
"kl": 0.0543212890625,
"learning_rate": 9.1400441557401e-07,
"loss": 0.1198,
"reward": 4.375,
"reward_std": 2.6551371216773987,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.90625,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 575.375,
"epoch": 23.22222222222222,
"grad_norm": 1.5494132503619473,
"kl": 0.04376220703125,
"learning_rate": 9.116724800922629e-07,
"loss": 0.1098,
"reward": 3.6875,
"reward_std": 1.9493454694747925,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 577.5,
"epoch": 23.444444444444443,
"grad_norm": 1.2511045169588764,
"kl": 0.0521240234375,
"learning_rate": 9.093124073433462e-07,
"loss": 0.0389,
"reward": 3.5625,
"reward_std": 2.1182020902633667,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 1.0,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 590.78125,
"epoch": 23.666666666666668,
"grad_norm": 1.5974928179741261,
"kl": 0.045074462890625,
"learning_rate": 9.069243586350975e-07,
"loss": -0.0127,
"reward": 4.09375,
"reward_std": 2.1429253816604614,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 1.0,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 454.1875,
"epoch": 23.88888888888889,
"grad_norm": 1.885519118261372,
"kl": 0.0450439453125,
"learning_rate": 9.045084971874737e-07,
"loss": 0.0469,
"reward": 4.0625,
"reward_std": 2.76924729347229,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 538.0625,
"epoch": 24.22222222222222,
"grad_norm": 1.5682355592026038,
"kl": 0.05267333984375,
"learning_rate": 9.020649881213958e-07,
"loss": 0.0061,
"reward": 3.40625,
"reward_std": 2.1967990398406982,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 632.75,
"epoch": 24.444444444444443,
"grad_norm": 1.2736403946455588,
"kl": 0.044189453125,
"learning_rate": 8.995939984474623e-07,
"loss": 0.0172,
"reward": 3.84375,
"reward_std": 2.4564297795295715,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 1.0,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 582.90625,
"epoch": 24.666666666666668,
"grad_norm": 1.5123549273068009,
"kl": 0.04638671875,
"learning_rate": 8.970956970545355e-07,
"loss": 0.0662,
"reward": 3.78125,
"reward_std": 2.7111909985542297,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 99
},
{
"epoch": 24.88888888888889,
"grad_norm": 1.7849471498217702,
"learning_rate": 8.945702546981968e-07,
"loss": 0.142,
"step": 100
},
{
"epoch": 24.88888888888889,
"eval_clip_ratio": 0.0,
"eval_completion_length": 511.125,
"eval_kl": 0.073193359375,
"eval_loss": -0.007530718110501766,
"eval_reward": 2.075,
"eval_reward_std": 0.6665439963340759,
"eval_rewards/accuracy_reward_staging": 0.05,
"eval_rewards/format_reward": 0.85,
"eval_rewards/format_reward_staging": 0.975,
"eval_runtime": 50.3514,
"eval_samples_per_second": 0.715,
"eval_steps_per_second": 0.099,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 581.21875,
"epoch": 25.22222222222222,
"grad_norm": 1.8652012487452174,
"kl": 0.05792236328125,
"learning_rate": 8.920178439890764e-07,
"loss": 0.0112,
"reward": 3.46875,
"reward_std": 1.8295301795005798,
"rewards/accuracy_reward_staging": 0.328125,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.9375,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 597.875,
"epoch": 25.444444444444443,
"grad_norm": 4.84795988570716,
"kl": 0.06231689453125,
"learning_rate": 8.894386393810562e-07,
"loss": 0.0844,
"reward": 2.875,
"reward_std": 1.6470783054828644,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.96875,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 562.5,
"epoch": 25.666666666666668,
"grad_norm": 1.8754521848828094,
"kl": 0.052978515625,
"learning_rate": 8.868328171593446e-07,
"loss": -0.0154,
"reward": 4.25,
"reward_std": 2.547704756259918,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 1.0,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 537.3125,
"epoch": 25.88888888888889,
"grad_norm": 1.797756724546587,
"kl": 0.05206298828125,
"learning_rate": 8.842005554284295e-07,
"loss": -0.0275,
"reward": 3.84375,
"reward_std": 2.51630362868309,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 622.53125,
"epoch": 26.22222222222222,
"grad_norm": 1.300955660750681,
"kl": 0.05413818359375,
"learning_rate": 8.815420340999033e-07,
"loss": 0.0637,
"reward": 3.84375,
"reward_std": 1.3620327413082123,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 594.25,
"epoch": 26.444444444444443,
"grad_norm": 1.43085940167237,
"kl": 0.0439453125,
"learning_rate": 8.788574348801674e-07,
"loss": 0.0768,
"reward": 4.625,
"reward_std": 1.9858438968658447,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 582.15625,
"epoch": 26.666666666666668,
"grad_norm": 1.6135777138066925,
"kl": 0.06390380859375,
"learning_rate": 8.761469412580124e-07,
"loss": 0.0142,
"reward": 1.96875,
"reward_std": 1.00966876745224,
"rewards/accuracy_reward_staging": 0.0625,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.875,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 587.0625,
"epoch": 26.88888888888889,
"grad_norm": 2.1207393888337887,
"kl": 0.06134033203125,
"learning_rate": 8.734107384920769e-07,
"loss": 0.0242,
"reward": 4.125,
"reward_std": 3.0213340520858765,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 548.40625,
"epoch": 27.22222222222222,
"grad_norm": 1.5591626897717148,
"kl": 0.0465087890625,
"learning_rate": 8.706490135981855e-07,
"loss": -0.0282,
"reward": 4.5625,
"reward_std": 2.360237419605255,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 598.8125,
"epoch": 27.444444444444443,
"grad_norm": 1.1645802297935302,
"kl": 0.04632568359375,
"learning_rate": 8.678619553365658e-07,
"loss": -0.0278,
"reward": 3.21875,
"reward_std": 1.8432062864303589,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 610.21875,
"epoch": 27.666666666666668,
"grad_norm": 1.7617563181087859,
"kl": 0.0550537109375,
"learning_rate": 8.650497541989481e-07,
"loss": -0.0219,
"reward": 2.84375,
"reward_std": 1.7233919501304626,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 602.5,
"epoch": 27.88888888888889,
"grad_norm": 1.4790592908519822,
"kl": 0.04412841796875,
"learning_rate": 8.622126023955445e-07,
"loss": 0.0624,
"reward": 3.65625,
"reward_std": 2.0483438968658447,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 602.21875,
"epoch": 28.22222222222222,
"grad_norm": 1.4599487812585739,
"kl": 0.0484619140625,
"learning_rate": 8.593506938419119e-07,
"loss": 0.0459,
"reward": 3.84375,
"reward_std": 0.9925079494714737,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.96875,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 569.71875,
"epoch": 28.444444444444443,
"grad_norm": 1.3832144010184737,
"kl": 0.0467529296875,
"learning_rate": 8.564642241456986e-07,
"loss": 0.0025,
"reward": 3.71875,
"reward_std": 1.9233438968658447,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 548.78125,
"epoch": 28.666666666666668,
"grad_norm": 1.803131512178923,
"kl": 0.0576171875,
"learning_rate": 8.535533905932737e-07,
"loss": -0.0187,
"reward": 3.90625,
"reward_std": 2.563826858997345,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.96875,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 564.4375,
"epoch": 28.88888888888889,
"grad_norm": 1.6402368950584498,
"kl": 0.05029296875,
"learning_rate": 8.506183921362442e-07,
"loss": -0.0174,
"reward": 3.3125,
"reward_std": 2.5466037690639496,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 563.5625,
"epoch": 29.22222222222222,
"grad_norm": 1.4985773194128882,
"kl": 0.04840087890625,
"learning_rate": 8.47659429377856e-07,
"loss": -0.0153,
"reward": 3.875,
"reward_std": 2.3320942521095276,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 565.34375,
"epoch": 29.444444444444443,
"grad_norm": 1.7501844147476033,
"kl": 0.05194091796875,
"learning_rate": 8.446767045592829e-07,
"loss": 0.0359,
"reward": 3.84375,
"reward_std": 2.3963494896888733,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.96875,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 574.5,
"epoch": 29.666666666666668,
"grad_norm": 1.2571401212163673,
"kl": 0.0498046875,
"learning_rate": 8.416704215458042e-07,
"loss": 0.0187,
"reward": 3.3125,
"reward_std": 1.125,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.9375,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 597.25,
"epoch": 29.88888888888889,
"grad_norm": 1.2235795288016953,
"kl": 0.04754638671875,
"learning_rate": 8.386407858128706e-07,
"loss": -0.0144,
"reward": 3.25,
"reward_std": 1.5358919501304626,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 597.15625,
"epoch": 30.22222222222222,
"grad_norm": 1.6274382257749778,
"kl": 0.060791015625,
"learning_rate": 8.355880044320597e-07,
"loss": 0.0121,
"reward": 3.34375,
"reward_std": 2.7569093704223633,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.84375,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 592.8125,
"epoch": 30.444444444444443,
"grad_norm": 2.5186927220968895,
"kl": 0.09588623046875,
"learning_rate": 8.325122860569241e-07,
"loss": 0.0081,
"reward": 3.15625,
"reward_std": 2.1270195841789246,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.875,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 564.1875,
"epoch": 30.666666666666668,
"grad_norm": 1.4932442148368137,
"kl": 0.04656982421875,
"learning_rate": 8.294138409087289e-07,
"loss": 0.0298,
"reward": 3.625,
"reward_std": 2.008278489112854,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 1.0,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 607.375,
"epoch": 30.88888888888889,
"grad_norm": 3.4718877576698746,
"kl": 0.076904296875,
"learning_rate": 8.262928807620843e-07,
"loss": -0.0234,
"reward": 3.6875,
"reward_std": 2.751339912414551,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 531.5625,
"epoch": 31.22222222222222,
"grad_norm": 1.622119125741056,
"kl": 0.05914306640625,
"learning_rate": 8.231496189304704e-07,
"loss": 0.0119,
"reward": 3.78125,
"reward_std": 1.9775724411010742,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 1.0,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 645.84375,
"epoch": 31.444444444444443,
"grad_norm": 1.6061164218143151,
"kl": 0.0496826171875,
"learning_rate": 8.199842702516582e-07,
"loss": 0.0355,
"reward": 3.90625,
"reward_std": 2.5803541243076324,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.9375,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 597.3125,
"epoch": 31.666666666666668,
"grad_norm": 1.3457598005679037,
"kl": 0.0526123046875,
"learning_rate": 8.167970510730252e-07,
"loss": -0.0134,
"reward": 3.15625,
"reward_std": 1.8007422089576721,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 640.25,
"epoch": 31.88888888888889,
"grad_norm": 1.5569181185599603,
"kl": 0.058349609375,
"learning_rate": 8.135881792367685e-07,
"loss": -0.0192,
"reward": 3.59375,
"reward_std": 1.5271694660186768,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 1.0,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 574.34375,
"epoch": 32.22222222222222,
"grad_norm": 1.6790409041925978,
"kl": 0.05426025390625,
"learning_rate": 8.103578740650156e-07,
"loss": -0.0013,
"reward": 3.8125,
"reward_std": 2.151860535144806,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 568.0,
"epoch": 32.44444444444444,
"grad_norm": 1.7164186713447234,
"kl": 0.0628662109375,
"learning_rate": 8.071063563448339e-07,
"loss": 0.0355,
"reward": 3.09375,
"reward_std": 2.110320746898651,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.90625,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 612.125,
"epoch": 32.666666666666664,
"grad_norm": 1.4484213626473657,
"kl": 0.0438232421875,
"learning_rate": 8.038338483131406e-07,
"loss": 0.0675,
"reward": 2.65625,
"reward_std": 1.5483438968658447,
"rewards/accuracy_reward_staging": 0.15625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 633.375,
"epoch": 32.888888888888886,
"grad_norm": 1.4888928164051263,
"kl": 0.046630859375,
"learning_rate": 8.005405736415125e-07,
"loss": 0.003,
"reward": 3.5625,
"reward_std": 2.257579743862152,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 608.28125,
"epoch": 33.22222222222222,
"grad_norm": 1.4537634594396451,
"kl": 0.05352783203125,
"learning_rate": 7.97226757420899e-07,
"loss": 0.0072,
"reward": 4.53125,
"reward_std": 2.650395154953003,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 589.625,
"epoch": 33.44444444444444,
"grad_norm": 5.103167634384414,
"kl": 0.107421875,
"learning_rate": 7.938926261462365e-07,
"loss": 0.0303,
"reward": 3.96875,
"reward_std": 1.4233438968658447,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 576.90625,
"epoch": 33.666666666666664,
"grad_norm": 5.13739196509469,
"kl": 0.09185791015625,
"learning_rate": 7.905384077009692e-07,
"loss": 0.0254,
"reward": 3.40625,
"reward_std": 2.5271694660186768,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 565.0,
"epoch": 33.888888888888886,
"grad_norm": 1.3347218031999781,
"kl": 0.05279541015625,
"learning_rate": 7.871643313414718e-07,
"loss": -0.0269,
"reward": 3.78125,
"reward_std": 1.9108592867851257,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 596.96875,
"epoch": 34.22222222222222,
"grad_norm": 1.6203773256898213,
"kl": 0.05377197265625,
"learning_rate": 7.837706276813818e-07,
"loss": -0.0507,
"reward": 3.78125,
"reward_std": 2.8475868701934814,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.90625,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 548.90625,
"epoch": 34.44444444444444,
"grad_norm": 1.7589228637659193,
"kl": 0.0518798828125,
"learning_rate": 7.803575286758363e-07,
"loss": 0.0256,
"reward": 3.84375,
"reward_std": 2.3770764470100403,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 574.90625,
"epoch": 34.666666666666664,
"grad_norm": 1.465848261824115,
"kl": 0.05047607421875,
"learning_rate": 7.769252676056186e-07,
"loss": 0.0121,
"reward": 3.0,
"reward_std": 1.999484658241272,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 621.5625,
"epoch": 34.888888888888886,
"grad_norm": 1.699502675045011,
"kl": 0.04669189453125,
"learning_rate": 7.734740790612136e-07,
"loss": -0.0043,
"reward": 3.65625,
"reward_std": 2.740947127342224,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.84375,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 575.03125,
"epoch": 35.22222222222222,
"grad_norm": 1.4180294898308454,
"kl": 0.04791259765625,
"learning_rate": 7.700041989267736e-07,
"loss": 0.0128,
"reward": 3.9375,
"reward_std": 1.6851893663406372,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 1.0,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 634.3125,
"epoch": 35.44444444444444,
"grad_norm": 0.97669552258444,
"kl": 0.04840087890625,
"learning_rate": 7.665158643639969e-07,
"loss": 0.0078,
"reward": 3.90625,
"reward_std": 1.2753951847553253,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 1.0,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 564.9375,
"epoch": 35.666666666666664,
"grad_norm": 1.4705421421024347,
"kl": 0.0458984375,
"learning_rate": 7.63009313795917e-07,
"loss": 0.0007,
"reward": 3.375,
"reward_std": 1.9858438968658447,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 630.8125,
"epoch": 35.888888888888886,
"grad_norm": 1.4040857696410018,
"kl": 0.0491943359375,
"learning_rate": 7.594847868906076e-07,
"loss": 0.0157,
"reward": 4.53125,
"reward_std": 1.881795346736908,
"rewards/accuracy_reward_staging": 0.5625,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.9375,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 597.4375,
"epoch": 36.22222222222222,
"grad_norm": 1.7416315495447303,
"kl": 0.05291748046875,
"learning_rate": 7.559425245448005e-07,
"loss": 0.1534,
"reward": 4.125,
"reward_std": 1.7268692255020142,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.90625,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 543.34375,
"epoch": 36.44444444444444,
"grad_norm": 1.3338618690781434,
"kl": 0.05255126953125,
"learning_rate": 7.523827688674219e-07,
"loss": 0.0048,
"reward": 3.46875,
"reward_std": 1.7618454992771149,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 579.625,
"epoch": 36.666666666666664,
"grad_norm": 1.8245501253344487,
"kl": 0.04931640625,
"learning_rate": 7.488057631630437e-07,
"loss": 0.0975,
"reward": 3.78125,
"reward_std": 2.0842358469963074,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 1.0,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 556.25,
"epoch": 36.888888888888886,
"grad_norm": 1.60039839729214,
"kl": 0.0479736328125,
"learning_rate": 7.452117519152541e-07,
"loss": -0.0225,
"reward": 4.75,
"reward_std": 2.8358521461486816,
"rewards/accuracy_reward_staging": 0.59375,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.9375,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 575.59375,
"epoch": 37.22222222222222,
"grad_norm": 1.9262708460562594,
"kl": 0.04852294921875,
"learning_rate": 7.416009807699481e-07,
"loss": 0.0694,
"reward": 3.875,
"reward_std": 2.4488722383975983,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 149
},
{
"epoch": 37.44444444444444,
"grad_norm": 1.4126152849193538,
"learning_rate": 7.379736965185368e-07,
"loss": 0.0461,
"step": 150
},
{
"epoch": 37.44444444444444,
"eval_clip_ratio": 0.0,
"eval_completion_length": 583.05,
"eval_kl": 0.045751953125,
"eval_loss": -0.002990193199366331,
"eval_reward": 2.725,
"eval_reward_std": 1.3047046661376953,
"eval_rewards/accuracy_reward_staging": 0.175,
"eval_rewards/format_reward": 0.875,
"eval_rewards/format_reward_staging": 0.975,
"eval_runtime": 52.1348,
"eval_samples_per_second": 0.691,
"eval_steps_per_second": 0.096,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 566.234375,
"epoch": 37.666666666666664,
"grad_norm": 1.6128745248404066,
"kl": 0.0498046875,
"learning_rate": 7.343301470810807e-07,
"loss": 0.0205,
"reward": 3.8125,
"reward_std": 2.2092738151550293,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.984375,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 561.6875,
"epoch": 37.888888888888886,
"grad_norm": 1.6630192300364095,
"kl": 0.051513671875,
"learning_rate": 7.306705814893439e-07,
"loss": 0.0613,
"reward": 4.75,
"reward_std": 3.510585069656372,
"rewards/accuracy_reward_staging": 0.5625,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 587.40625,
"epoch": 38.22222222222222,
"grad_norm": 1.5821951322432892,
"kl": 0.0535888671875,
"learning_rate": 7.269952498697734e-07,
"loss": 0.0053,
"reward": 3.78125,
"reward_std": 2.4141127467155457,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.84375,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 572.6875,
"epoch": 38.44444444444444,
"grad_norm": 2.2424195208633453,
"kl": 0.07366943359375,
"learning_rate": 7.233044034264033e-07,
"loss": 0.0315,
"reward": 3.84375,
"reward_std": 2.3134855031967163,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.90625,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 555.9375,
"epoch": 38.666666666666664,
"grad_norm": 1.529067187866317,
"kl": 0.05157470703125,
"learning_rate": 7.195982944236852e-07,
"loss": 0.0321,
"reward": 2.8125,
"reward_std": 1.796603798866272,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 630.0,
"epoch": 38.888888888888886,
"grad_norm": 1.579548286063579,
"kl": 0.050537109375,
"learning_rate": 7.158771761692464e-07,
"loss": 0.0309,
"reward": 4.28125,
"reward_std": 2.8335397839546204,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.90625,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 563.0625,
"epoch": 39.22222222222222,
"grad_norm": 1.4963033786464435,
"kl": 0.050048828125,
"learning_rate": 7.121413029965769e-07,
"loss": 0.0482,
"reward": 3.8125,
"reward_std": 2.3843142986297607,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.9375,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 576.59375,
"epoch": 39.44444444444444,
"grad_norm": 1.4775879396602463,
"kl": 0.054443359375,
"learning_rate": 7.083909302476452e-07,
"loss": 0.0164,
"reward": 3.71875,
"reward_std": 1.9704924821853638,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.90625,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 555.8125,
"epoch": 39.666666666666664,
"grad_norm": 1.7649876199956425,
"kl": 0.0699462890625,
"learning_rate": 7.04626314255447e-07,
"loss": 0.0019,
"reward": 4.4375,
"reward_std": 2.7981574535369873,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 498.75,
"epoch": 39.888888888888886,
"grad_norm": 1.3915513369029784,
"kl": 0.0543212890625,
"learning_rate": 7.008477123264847e-07,
"loss": 0.0433,
"reward": 2.90625,
"reward_std": 1.3342358469963074,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 569.53125,
"epoch": 40.22222222222222,
"grad_norm": 1.562840542671513,
"kl": 0.053955078125,
"learning_rate": 6.970553827231808e-07,
"loss": 0.0164,
"reward": 4.625,
"reward_std": 2.55762779712677,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 538.3125,
"epoch": 40.44444444444444,
"grad_norm": 1.4692239574350316,
"kl": 0.0526123046875,
"learning_rate": 6.932495846462261e-07,
"loss": -0.0164,
"reward": 3.65625,
"reward_std": 1.8189646005630493,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 531.875,
"epoch": 40.666666666666664,
"grad_norm": 1.5332016106483515,
"kl": 0.05316162109375,
"learning_rate": 6.894305782168638e-07,
"loss": -0.0429,
"reward": 4.3125,
"reward_std": 2.5211293697357178,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 545.125,
"epoch": 40.888888888888886,
"grad_norm": 13.136996472534078,
"kl": 0.11846923828125,
"learning_rate": 6.855986244591103e-07,
"loss": -0.0235,
"reward": 3.28125,
"reward_std": 2.338345527648926,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.90625,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 543.40625,
"epoch": 41.22222222222222,
"grad_norm": 1.3242743159265937,
"kl": 0.04998779296875,
"learning_rate": 6.817539852819148e-07,
"loss": 0.0115,
"reward": 3.1875,
"reward_std": 1.375,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 520.1875,
"epoch": 41.44444444444444,
"grad_norm": 1.2491437366023406,
"kl": 0.05328369140625,
"learning_rate": 6.778969234612583e-07,
"loss": 0.0198,
"reward": 4.84375,
"reward_std": 1.7444601655006409,
"rewards/accuracy_reward_staging": 0.59375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 619.59375,
"epoch": 41.666666666666664,
"grad_norm": 1.6792411977674075,
"kl": 0.05487060546875,
"learning_rate": 6.740277026221922e-07,
"loss": 0.011,
"reward": 3.21875,
"reward_std": 2.509488582611084,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.96875,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 539.125,
"epoch": 41.888888888888886,
"grad_norm": 2.61987913810964,
"kl": 0.08526611328125,
"learning_rate": 6.701465872208216e-07,
"loss": 0.0355,
"reward": 5.71875,
"reward_std": 2.992280900478363,
"rewards/accuracy_reward_staging": 0.78125,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 1.0,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 601.65625,
"epoch": 42.22222222222222,
"grad_norm": 1.5317010015458066,
"kl": 0.0543212890625,
"learning_rate": 6.662538425262284e-07,
"loss": -0.0412,
"reward": 3.75,
"reward_std": 2.802945911884308,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 607.5,
"epoch": 42.44444444444444,
"grad_norm": 1.5445749846218586,
"kl": 0.05462646484375,
"learning_rate": 6.623497346023417e-07,
"loss": -0.0053,
"reward": 3.0625,
"reward_std": 1.4321783781051636,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 531.53125,
"epoch": 42.666666666666664,
"grad_norm": 1.727090236953967,
"kl": 0.05303955078125,
"learning_rate": 6.584345302897522e-07,
"loss": 0.0752,
"reward": 4.9375,
"reward_std": 2.6843830347061157,
"rewards/accuracy_reward_staging": 0.59375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 511.125,
"epoch": 42.888888888888886,
"grad_norm": 1.463526052255072,
"kl": 0.05108642578125,
"learning_rate": 6.545084971874736e-07,
"loss": -0.0218,
"reward": 4.28125,
"reward_std": 2.3289482593536377,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 591.46875,
"epoch": 43.22222222222222,
"grad_norm": 1.6155726361043279,
"kl": 0.06103515625,
"learning_rate": 6.505719036346537e-07,
"loss": 0.0385,
"reward": 3.3125,
"reward_std": 2.2124131619930267,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.90625,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 570.96875,
"epoch": 43.44444444444444,
"grad_norm": 1.3751712901264466,
"kl": 0.0545654296875,
"learning_rate": 6.466250186922324e-07,
"loss": 0.0063,
"reward": 3.1875,
"reward_std": 2.130874752998352,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.84375,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 545.71875,
"epoch": 43.666666666666664,
"grad_norm": 1.4756595692040109,
"kl": 0.059326171875,
"learning_rate": 6.426681121245527e-07,
"loss": -0.0295,
"reward": 3.59375,
"reward_std": 2.3869778215885162,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 595.1875,
"epoch": 43.888888888888886,
"grad_norm": 1.4156928353056575,
"kl": 0.050048828125,
"learning_rate": 6.387014543809223e-07,
"loss": -0.0245,
"reward": 3.625,
"reward_std": 2.184383064508438,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.9375,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 549.28125,
"epoch": 44.22222222222222,
"grad_norm": 1.6625979237822903,
"kl": 0.05389404296875,
"learning_rate": 6.347253165771289e-07,
"loss": 0.0393,
"reward": 4.34375,
"reward_std": 2.0728103518486023,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 616.4375,
"epoch": 44.44444444444444,
"grad_norm": 0.9016620089051227,
"kl": 0.04852294921875,
"learning_rate": 6.307399704769098e-07,
"loss": 0.0327,
"reward": 3.3125,
"reward_std": 1.9239110946655273,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 554.40625,
"epoch": 44.666666666666664,
"grad_norm": 1.4445648538792832,
"kl": 0.06365966796875,
"learning_rate": 6.26745688473377e-07,
"loss": 0.0527,
"reward": 2.90625,
"reward_std": 1.2700245678424835,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 596.6875,
"epoch": 44.888888888888886,
"grad_norm": 1.5856705116806837,
"kl": 0.06280517578125,
"learning_rate": 6.227427435703995e-07,
"loss": 0.0488,
"reward": 3.59375,
"reward_std": 2.1598991453647614,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 1.0,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 540.375,
"epoch": 45.22222222222222,
"grad_norm": 1.4832995345417785,
"kl": 0.0472412109375,
"learning_rate": 6.187314093639443e-07,
"loss": 0.021,
"reward": 3.8125,
"reward_std": 2.2678900957107544,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 596.75,
"epoch": 45.44444444444444,
"grad_norm": 1.5871845074006228,
"kl": 0.048828125,
"learning_rate": 6.147119600233758e-07,
"loss": -0.025,
"reward": 4.40625,
"reward_std": 2.732926845550537,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.9375,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 636.21875,
"epoch": 45.666666666666664,
"grad_norm": 1.1682267885626447,
"kl": 0.050537109375,
"learning_rate": 6.106846702727172e-07,
"loss": -0.0041,
"reward": 3.5625,
"reward_std": 1.9367179870605469,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 567.6875,
"epoch": 45.888888888888886,
"grad_norm": 1.182505436622169,
"kl": 0.052490234375,
"learning_rate": 6.066498153718734e-07,
"loss": -0.0104,
"reward": 3.96875,
"reward_std": 1.8926886320114136,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 556.84375,
"epoch": 46.22222222222222,
"grad_norm": 74.95843070592915,
"kl": 0.51153564453125,
"learning_rate": 6.026076710978171e-07,
"loss": -0.0099,
"reward": 4.03125,
"reward_std": 2.5020731687545776,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 635.9375,
"epoch": 46.44444444444444,
"grad_norm": 1.1802575443084546,
"kl": 0.046630859375,
"learning_rate": 5.985585137257401e-07,
"loss": -0.0104,
"reward": 3.75,
"reward_std": 1.5358919501304626,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 584.625,
"epoch": 46.666666666666664,
"grad_norm": 1.540364554923698,
"kl": 0.053955078125,
"learning_rate": 5.945026200101702e-07,
"loss": 0.0173,
"reward": 3.71875,
"reward_std": 2.7078438997268677,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 511.375,
"epoch": 46.888888888888886,
"grad_norm": 1.3487938182691792,
"kl": 0.05859375,
"learning_rate": 5.90440267166055e-07,
"loss": 0.0363,
"reward": 3.125,
"reward_std": 2.2170365154743195,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.96875,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 536.40625,
"epoch": 47.22222222222222,
"grad_norm": 1.7030200868844614,
"kl": 0.054931640625,
"learning_rate": 5.863717328498152e-07,
"loss": 0.0328,
"reward": 3.84375,
"reward_std": 2.070079743862152,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.96875,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 555.875,
"epoch": 47.44444444444444,
"grad_norm": 1.7566836455673576,
"kl": 0.05218505859375,
"learning_rate": 5.82297295140367e-07,
"loss": -0.0381,
"reward": 3.75,
"reward_std": 2.009314328432083,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 545.8125,
"epoch": 47.666666666666664,
"grad_norm": 1.594063347049537,
"kl": 0.05426025390625,
"learning_rate": 5.782172325201155e-07,
"loss": 0.0535,
"reward": 3.21875,
"reward_std": 1.7700316905975342,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 622.8125,
"epoch": 47.888888888888886,
"grad_norm": 1.5439318867500331,
"kl": 0.04937744140625,
"learning_rate": 5.741318238559209e-07,
"loss": -0.0012,
"reward": 4.75,
"reward_std": 2.4349581599235535,
"rewards/accuracy_reward_staging": 0.5625,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 583.75,
"epoch": 48.22222222222222,
"grad_norm": 2.5201319810344454,
"kl": 0.0770263671875,
"learning_rate": 5.700413483800389e-07,
"loss": -0.0762,
"reward": 3.4375,
"reward_std": 1.82216876745224,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.90625,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 599.59375,
"epoch": 48.44444444444444,
"grad_norm": 1.473198815087056,
"kl": 0.05352783203125,
"learning_rate": 5.659460856710345e-07,
"loss": -0.0055,
"reward": 3.5625,
"reward_std": 1.9599019289016724,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 569.75,
"epoch": 48.666666666666664,
"grad_norm": 1.6168573027198114,
"kl": 0.05010986328125,
"learning_rate": 5.618463156346739e-07,
"loss": -0.0075,
"reward": 4.21875,
"reward_std": 1.739636391401291,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 577.375,
"epoch": 48.888888888888886,
"grad_norm": 1.5839729942600627,
"kl": 0.04180908203125,
"learning_rate": 5.577423184847931e-07,
"loss": 0.0086,
"reward": 3.875,
"reward_std": 2.332531690597534,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 532.96875,
"epoch": 49.22222222222222,
"grad_norm": 1.5767088515541903,
"kl": 0.04962158203125,
"learning_rate": 5.536343747241459e-07,
"loss": 0.0159,
"reward": 4.15625,
"reward_std": 1.9809716939926147,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 578.28125,
"epoch": 49.44444444444444,
"grad_norm": 1.3049917889915577,
"kl": 0.04583740234375,
"learning_rate": 5.495227651252315e-07,
"loss": 0.0386,
"reward": 4.53125,
"reward_std": 1.7373294830322266,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 546.0625,
"epoch": 49.666666666666664,
"grad_norm": 1.3164741992532543,
"kl": 0.0504150390625,
"learning_rate": 5.454077707111041e-07,
"loss": 0.0142,
"reward": 4.65625,
"reward_std": 1.945079743862152,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 199
},
{
"epoch": 49.888888888888886,
"grad_norm": 1.3350172420738584,
"learning_rate": 5.412896727361662e-07,
"loss": 0.0656,
"step": 200
},
{
"epoch": 49.888888888888886,
"eval_clip_ratio": 0.0,
"eval_completion_length": 600.85,
"eval_kl": 0.047802734375,
"eval_loss": 0.025471828877925873,
"eval_reward": 2.6,
"eval_reward_std": 1.3353363513946532,
"eval_rewards/accuracy_reward_staging": 0.15,
"eval_rewards/format_reward": 0.9,
"eval_rewards/format_reward_staging": 0.95,
"eval_runtime": 52.2669,
"eval_samples_per_second": 0.689,
"eval_steps_per_second": 0.096,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 614.453125,
"epoch": 50.22222222222222,
"grad_norm": 1.284721283794701,
"kl": 0.05389404296875,
"learning_rate": 5.371687526669439e-07,
"loss": 0.0086,
"reward": 3.421875,
"reward_std": 2.202674761414528,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.890625,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 545.53125,
"epoch": 50.44444444444444,
"grad_norm": 1.235503506247465,
"kl": 0.0528564453125,
"learning_rate": 5.330452921628497e-07,
"loss": -0.0137,
"reward": 3.5625,
"reward_std": 1.246154248714447,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 560.25,
"epoch": 50.666666666666664,
"grad_norm": 1.9492031211380043,
"kl": 0.0654296875,
"learning_rate": 5.28919573056932e-07,
"loss": -0.049,
"reward": 4.28125,
"reward_std": 2.934589922428131,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.90625,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 599.3125,
"epoch": 50.888888888888886,
"grad_norm": 1.567290502007258,
"kl": 0.04364013671875,
"learning_rate": 5.247918773366111e-07,
"loss": 0.0937,
"reward": 3.875,
"reward_std": 1.930722177028656,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 562.6875,
"epoch": 51.22222222222222,
"grad_norm": 1.4477817793212922,
"kl": 0.05084228515625,
"learning_rate": 5.206624871244065e-07,
"loss": 0.0148,
"reward": 2.90625,
"reward_std": 1.4091877937316895,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 605.625,
"epoch": 51.44444444444444,
"grad_norm": 1.5674813338685252,
"kl": 0.04931640625,
"learning_rate": 5.165316846586541e-07,
"loss": 0.0963,
"reward": 3.125,
"reward_std": 2.1649354100227356,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.9375,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 572.5625,
"epoch": 51.666666666666664,
"grad_norm": 1.521375079838418,
"kl": 0.046875,
"learning_rate": 5.123997522742151e-07,
"loss": 0.0215,
"reward": 3.71875,
"reward_std": 2.047757565975189,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 524.4375,
"epoch": 51.888888888888886,
"grad_norm": 1.637742061840183,
"kl": 0.04779052734375,
"learning_rate": 5.082669723831793e-07,
"loss": -0.0249,
"reward": 3.59375,
"reward_std": 2.858625650405884,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 566.875,
"epoch": 52.22222222222222,
"grad_norm": 1.5832843072882397,
"kl": 0.04449462890625,
"learning_rate": 5.041336274555625e-07,
"loss": -0.063,
"reward": 2.84375,
"reward_std": 1.2771694660186768,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 576.0625,
"epoch": 52.44444444444444,
"grad_norm": 1.5508316387978383,
"kl": 0.06103515625,
"learning_rate": 5e-07,
"loss": -0.0291,
"reward": 4.0,
"reward_std": 2.082531690597534,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 611.71875,
"epoch": 52.666666666666664,
"grad_norm": 1.6164552079690877,
"kl": 0.04437255859375,
"learning_rate": 4.958663725444375e-07,
"loss": 0.0102,
"reward": 4.40625,
"reward_std": 2.5580477714538574,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 563.3125,
"epoch": 52.888888888888886,
"grad_norm": 1.5726439650006456,
"kl": 0.05096435546875,
"learning_rate": 4.917330276168208e-07,
"loss": -0.0031,
"reward": 4.96875,
"reward_std": 2.3175911903381348,
"rewards/accuracy_reward_staging": 0.625,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 1.0,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 616.375,
"epoch": 53.22222222222222,
"grad_norm": 1.7880025106936461,
"kl": 0.04498291015625,
"learning_rate": 4.87600247725785e-07,
"loss": 0.066,
"reward": 3.1875,
"reward_std": 1.891027882695198,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.90625,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 556.6875,
"epoch": 53.44444444444444,
"grad_norm": 2.0232137942713573,
"kl": 0.0498046875,
"learning_rate": 4.834683153413459e-07,
"loss": 0.0311,
"reward": 3.5625,
"reward_std": 1.6434174478054047,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 555.09375,
"epoch": 53.666666666666664,
"grad_norm": 1.4253180533139413,
"kl": 0.0416259765625,
"learning_rate": 4.793375128755933e-07,
"loss": -0.0401,
"reward": 4.03125,
"reward_std": 2.570079743862152,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 597.4375,
"epoch": 53.888888888888886,
"grad_norm": 1.740974206086713,
"kl": 0.04815673828125,
"learning_rate": 4.752081226633888e-07,
"loss": -0.038,
"reward": 4.34375,
"reward_std": 2.6059716939926147,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 564.0625,
"epoch": 54.22222222222222,
"grad_norm": 1.6032171003103113,
"kl": 0.05572509765625,
"learning_rate": 4.71080426943068e-07,
"loss": 0.0092,
"reward": 3.0625,
"reward_std": 1.996816635131836,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.90625,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 530.03125,
"epoch": 54.44444444444444,
"grad_norm": 1.3127356050754018,
"kl": 0.0540771484375,
"learning_rate": 4.669547078371503e-07,
"loss": -0.0245,
"reward": 6.59375,
"reward_std": 2.073159486055374,
"rewards/accuracy_reward_staging": 0.9375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 608.3125,
"epoch": 54.666666666666664,
"grad_norm": 1.7016991990394592,
"kl": 0.05010986328125,
"learning_rate": 4.628312473330562e-07,
"loss": 0.0702,
"reward": 3.875,
"reward_std": 2.482748866081238,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 549.375,
"epoch": 54.888888888888886,
"grad_norm": 1.3141608271515532,
"kl": 0.04742431640625,
"learning_rate": 4.5871032726383385e-07,
"loss": 0.0552,
"reward": 3.125,
"reward_std": 1.3886407911777496,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 578.71875,
"epoch": 55.22222222222222,
"grad_norm": 1.3932263109990592,
"kl": 0.04364013671875,
"learning_rate": 4.5459222928889587e-07,
"loss": 0.051,
"reward": 3.71875,
"reward_std": 1.7805703282356262,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 588.4375,
"epoch": 55.44444444444444,
"grad_norm": 1.5339621078239263,
"kl": 0.04962158203125,
"learning_rate": 4.5047723487476864e-07,
"loss": -0.0216,
"reward": 3.46875,
"reward_std": 2.488185405731201,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 1.0,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 585.34375,
"epoch": 55.666666666666664,
"grad_norm": 1.6607509386936015,
"kl": 0.04962158203125,
"learning_rate": 4.463656252758542e-07,
"loss": 0.0452,
"reward": 3.8125,
"reward_std": 2.171033263206482,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.9375,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 631.75,
"epoch": 55.888888888888886,
"grad_norm": 1.5614778713632624,
"kl": 0.04669189453125,
"learning_rate": 4.4225768151520694e-07,
"loss": 0.0801,
"reward": 3.5625,
"reward_std": 2.430722177028656,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 613.75,
"epoch": 56.22222222222222,
"grad_norm": 1.5004046938088074,
"kl": 0.05950927734375,
"learning_rate": 4.381536843653261e-07,
"loss": 0.0698,
"reward": 3.59375,
"reward_std": 2.5734615325927734,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.9375,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 565.375,
"epoch": 56.44444444444444,
"grad_norm": 1.3766714019303354,
"kl": 0.04168701171875,
"learning_rate": 4.340539143289655e-07,
"loss": 0.0233,
"reward": 3.5,
"reward_std": 2.0,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 553.125,
"epoch": 56.666666666666664,
"grad_norm": 1.307050706736634,
"kl": 0.05133056640625,
"learning_rate": 4.2995865161996104e-07,
"loss": 0.0181,
"reward": 4.0625,
"reward_std": 2.421202301979065,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 516.4375,
"epoch": 56.888888888888886,
"grad_norm": 1.5405733998671278,
"kl": 0.0562744140625,
"learning_rate": 4.258681761440789e-07,
"loss": 0.0017,
"reward": 4.03125,
"reward_std": 2.49512779712677,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 621.84375,
"epoch": 57.22222222222222,
"grad_norm": 1.606949877632979,
"kl": 0.044189453125,
"learning_rate": 4.2178276747988444e-07,
"loss": -0.0076,
"reward": 4.3125,
"reward_std": 2.390491783618927,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 597.84375,
"epoch": 57.44444444444444,
"grad_norm": 1.5411205221206894,
"kl": 0.0574951171875,
"learning_rate": 4.1770270485963294e-07,
"loss": -0.0387,
"reward": 3.125,
"reward_std": 2.1638975143432617,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 603.1875,
"epoch": 57.666666666666664,
"grad_norm": 1.3383276534008064,
"kl": 0.04473876953125,
"learning_rate": 4.1362826715018497e-07,
"loss": 0.0122,
"reward": 3.6875,
"reward_std": 1.9202269613742828,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 552.25,
"epoch": 57.888888888888886,
"grad_norm": 1.7484795613881616,
"kl": 0.06341552734375,
"learning_rate": 4.095597328339452e-07,
"loss": -0.0426,
"reward": 4.46875,
"reward_std": 2.5560158491134644,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.96875,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 585.125,
"epoch": 58.22222222222222,
"grad_norm": 1.5442704440086175,
"kl": 0.05377197265625,
"learning_rate": 4.0549737998982994e-07,
"loss": -0.0062,
"reward": 3.65625,
"reward_std": 2.2512659430503845,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 601.59375,
"epoch": 58.44444444444444,
"grad_norm": 1.3070749287077408,
"kl": 0.05706787109375,
"learning_rate": 4.0144148627425986e-07,
"loss": 0.0357,
"reward": 4.5625,
"reward_std": 2.173893690109253,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 1.0,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 549.25,
"epoch": 58.666666666666664,
"grad_norm": 1.568215525888831,
"kl": 0.04644775390625,
"learning_rate": 3.973923289021829e-07,
"loss": -0.0236,
"reward": 3.375,
"reward_std": 2.125,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 559.875,
"epoch": 58.888888888888886,
"grad_norm": 1.247655763308189,
"kl": 0.05523681640625,
"learning_rate": 3.9335018462812664e-07,
"loss": 0.0335,
"reward": 4.40625,
"reward_std": 1.7515006065368652,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 608.75,
"epoch": 59.22222222222222,
"grad_norm": 1.4876134624852135,
"kl": 0.05291748046875,
"learning_rate": 3.893153297272828e-07,
"loss": 0.0246,
"reward": 3.28125,
"reward_std": 1.5280899405479431,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.9375,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 583.0,
"epoch": 59.44444444444444,
"grad_norm": 1.6243663627358595,
"kl": 0.04718017578125,
"learning_rate": 3.8528803997662423e-07,
"loss": -0.0226,
"reward": 4.5625,
"reward_std": 2.9370444416999817,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 584.90625,
"epoch": 59.666666666666664,
"grad_norm": 1.579154131750563,
"kl": 0.05328369140625,
"learning_rate": 3.812685906360557e-07,
"loss": -0.0118,
"reward": 3.5625,
"reward_std": 1.8252411782741547,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 602.875,
"epoch": 59.888888888888886,
"grad_norm": 1.6568742956735238,
"kl": 0.05029296875,
"learning_rate": 3.772572564296004e-07,
"loss": 0.0049,
"reward": 4.21875,
"reward_std": 2.6711304783821106,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 527.71875,
"epoch": 60.22222222222222,
"grad_norm": 1.5110623474715636,
"kl": 0.05316162109375,
"learning_rate": 3.7325431152662294e-07,
"loss": 0.004,
"reward": 3.65625,
"reward_std": 2.44047012925148,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 607.71875,
"epoch": 60.44444444444444,
"grad_norm": 1.5588742571636938,
"kl": 0.05126953125,
"learning_rate": 3.692600295230901e-07,
"loss": 0.0174,
"reward": 4.125,
"reward_std": 2.93262779712677,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 598.375,
"epoch": 60.666666666666664,
"grad_norm": 1.4200468362196192,
"kl": 0.05487060546875,
"learning_rate": 3.6527468342287096e-07,
"loss": 0.1256,
"reward": 3.8125,
"reward_std": 2.782258152961731,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.90625,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 598.9375,
"epoch": 60.888888888888886,
"grad_norm": 2.112230364965324,
"kl": 0.06414794921875,
"learning_rate": 3.612985456190778e-07,
"loss": -0.0099,
"reward": 4.0625,
"reward_std": 2.503733992576599,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 1.0,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 580.84375,
"epoch": 61.22222222222222,
"grad_norm": 1.5256554050376716,
"kl": 0.0540771484375,
"learning_rate": 3.5733188787544746e-07,
"loss": 0.0285,
"reward": 3.75,
"reward_std": 2.553140878677368,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 553.53125,
"epoch": 61.44444444444444,
"grad_norm": 1.5714805767176323,
"kl": 0.0645751953125,
"learning_rate": 3.533749813077677e-07,
"loss": 0.0666,
"reward": 4.71875,
"reward_std": 2.595756232738495,
"rewards/accuracy_reward_staging": 0.59375,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.9375,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 570.90625,
"epoch": 61.666666666666664,
"grad_norm": 1.3717833382169582,
"kl": 0.05242919921875,
"learning_rate": 3.4942809636534633e-07,
"loss": 0.0464,
"reward": 4.375,
"reward_std": 1.9917186498641968,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 648.6875,
"epoch": 61.888888888888886,
"grad_norm": 1.281888219474357,
"kl": 0.05694580078125,
"learning_rate": 3.454915028125263e-07,
"loss": -0.0053,
"reward": 4.1875,
"reward_std": 1.8432075381278992,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.90625,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 601.28125,
"epoch": 62.22222222222222,
"grad_norm": 1.2189149322070956,
"kl": 0.05279541015625,
"learning_rate": 3.415654697102478e-07,
"loss": -0.0095,
"reward": 3.65625,
"reward_std": 1.4233438968658447,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 249
},
{
"epoch": 62.44444444444444,
"grad_norm": 1.7869776388477054,
"learning_rate": 3.3765026539765827e-07,
"loss": 0.0694,
"step": 250
},
{
"epoch": 62.44444444444444,
"eval_clip_ratio": 0.0,
"eval_completion_length": 597.85,
"eval_kl": 0.050439453125,
"eval_loss": 0.033870112150907516,
"eval_reward": 2.5,
"eval_reward_std": 1.5911447525024414,
"eval_rewards/accuracy_reward_staging": 0.15,
"eval_rewards/format_reward": 0.825,
"eval_rewards/format_reward_staging": 0.925,
"eval_runtime": 53.5113,
"eval_samples_per_second": 0.673,
"eval_steps_per_second": 0.093,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 543.40625,
"epoch": 62.666666666666664,
"grad_norm": 1.580273768681387,
"kl": 0.058380126953125,
"learning_rate": 3.337461574737716e-07,
"loss": 0.0381,
"reward": 3.59375,
"reward_std": 1.963532954454422,
"rewards/accuracy_reward_staging": 0.359375,
"rewards/format_reward": 0.859375,
"rewards/format_reward_staging": 0.9375,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 629.9375,
"epoch": 62.888888888888886,
"grad_norm": 1.4446074061408753,
"kl": 0.04742431640625,
"learning_rate": 3.2985341277917846e-07,
"loss": 0.0576,
"reward": 3.5625,
"reward_std": 1.8048822581768036,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 610.03125,
"epoch": 63.22222222222222,
"grad_norm": 2.4797282896452084,
"kl": 0.06011962890625,
"learning_rate": 3.2597229737780774e-07,
"loss": 0.0258,
"reward": 2.71875,
"reward_std": 1.841366171836853,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.84375,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 535.8125,
"epoch": 63.44444444444444,
"grad_norm": 1.413545282954978,
"kl": 0.04827880859375,
"learning_rate": 3.221030765387417e-07,
"loss": 0.0266,
"reward": 4.0,
"reward_std": 1.7409893572330475,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 625.125,
"epoch": 63.666666666666664,
"grad_norm": 1.4981329871397806,
"kl": 0.04962158203125,
"learning_rate": 3.1824601471808497e-07,
"loss": 0.0841,
"reward": 4.5625,
"reward_std": 3.0762142539024353,
"rewards/accuracy_reward_staging": 0.5625,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.9375,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 633.125,
"epoch": 63.888888888888886,
"grad_norm": 1.4534244133457102,
"kl": 0.04656982421875,
"learning_rate": 3.1440137554088953e-07,
"loss": 0.029,
"reward": 3.84375,
"reward_std": 2.296931117773056,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 519.59375,
"epoch": 64.22222222222223,
"grad_norm": 1.586114819510263,
"kl": 0.05950927734375,
"learning_rate": 3.1056942178313604e-07,
"loss": 0.0666,
"reward": 4.375,
"reward_std": 2.7632179856300354,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 554.5,
"epoch": 64.44444444444444,
"grad_norm": 1.4820895514814123,
"kl": 0.057373046875,
"learning_rate": 3.06750415353774e-07,
"loss": 0.015,
"reward": 4.34375,
"reward_std": 2.6667675375938416,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 594.3125,
"epoch": 64.66666666666667,
"grad_norm": 1.4710703551980364,
"kl": 0.05108642578125,
"learning_rate": 3.029446172768193e-07,
"loss": -0.0532,
"reward": 3.71875,
"reward_std": 1.9592358469963074,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 615.125,
"epoch": 64.88888888888889,
"grad_norm": 1.195494273136427,
"kl": 0.05078125,
"learning_rate": 2.9915228767351535e-07,
"loss": -0.0471,
"reward": 3.71875,
"reward_std": 1.5842358469963074,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 582.78125,
"epoch": 65.22222222222223,
"grad_norm": 1.0142470745003664,
"kl": 0.0562744140625,
"learning_rate": 2.9537368574455303e-07,
"loss": 0.0116,
"reward": 3.90625,
"reward_std": 1.3764855861663818,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 551.5625,
"epoch": 65.44444444444444,
"grad_norm": 1.2621208103940496,
"kl": 0.045166015625,
"learning_rate": 2.916090697523549e-07,
"loss": 0.0065,
"reward": 3.5625,
"reward_std": 1.8217839002609253,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 639.625,
"epoch": 65.66666666666667,
"grad_norm": 1.2178177535688726,
"kl": 0.06982421875,
"learning_rate": 2.878586970034232e-07,
"loss": 0.0063,
"reward": 2.8125,
"reward_std": 1.2878219783306122,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.875,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 582.875,
"epoch": 65.88888888888889,
"grad_norm": 1.6224844954097977,
"kl": 0.04864501953125,
"learning_rate": 2.841228238307536e-07,
"loss": -0.0201,
"reward": 5.03125,
"reward_std": 2.2327269315719604,
"rewards/accuracy_reward_staging": 0.625,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 637.15625,
"epoch": 66.22222222222223,
"grad_norm": 1.2308430687264216,
"kl": 0.05279541015625,
"learning_rate": 2.8040170557631485e-07,
"loss": 0.0153,
"reward": 3.46875,
"reward_std": 2.0372338593006134,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 600.4375,
"epoch": 66.44444444444444,
"grad_norm": 1.5321111568180714,
"kl": 0.04827880859375,
"learning_rate": 2.7669559657359673e-07,
"loss": -0.0491,
"reward": 3.8125,
"reward_std": 2.4646694660186768,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 615.5,
"epoch": 66.66666666666667,
"grad_norm": 1.3679824700888612,
"kl": 0.05462646484375,
"learning_rate": 2.730047501302266e-07,
"loss": 0.0308,
"reward": 3.09375,
"reward_std": 2.642750769853592,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.90625,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 549.5,
"epoch": 66.88888888888889,
"grad_norm": 1.5033503223897624,
"kl": 0.0577392578125,
"learning_rate": 2.6932941851065615e-07,
"loss": -0.0215,
"reward": 4.9375,
"reward_std": 2.482675850391388,
"rewards/accuracy_reward_staging": 0.59375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 553.34375,
"epoch": 67.22222222222223,
"grad_norm": 1.4290897914516596,
"kl": 0.05340576171875,
"learning_rate": 2.656698529189193e-07,
"loss": 0.0366,
"reward": 3.78125,
"reward_std": 1.9895031452178955,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 1.0,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 575.1875,
"epoch": 67.44444444444444,
"grad_norm": 1.579184850033335,
"kl": 0.0518798828125,
"learning_rate": 2.620263034814632e-07,
"loss": 0.0078,
"reward": 4.4375,
"reward_std": 2.323539137840271,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.90625,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 584.34375,
"epoch": 67.66666666666667,
"grad_norm": 1.482511162141472,
"kl": 0.0482177734375,
"learning_rate": 2.58399019230052e-07,
"loss": -0.0587,
"reward": 3.6875,
"reward_std": 2.195499747991562,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 592.0,
"epoch": 67.88888888888889,
"grad_norm": 1.4652114217200525,
"kl": 0.049560546875,
"learning_rate": 2.547882480847461e-07,
"loss": 0.0021,
"reward": 3.1875,
"reward_std": 2.073539137840271,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 598.59375,
"epoch": 68.22222222222223,
"grad_norm": 1.5345326135427537,
"kl": 0.04913330078125,
"learning_rate": 2.5119423683695657e-07,
"loss": -0.0357,
"reward": 4.25,
"reward_std": 2.9848236441612244,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 597.4375,
"epoch": 68.44444444444444,
"grad_norm": 1.5990838171502337,
"kl": 0.061279296875,
"learning_rate": 2.476172311325783e-07,
"loss": 0.0292,
"reward": 5.1875,
"reward_std": 2.957588255405426,
"rewards/accuracy_reward_staging": 0.6875,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.90625,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 632.71875,
"epoch": 68.66666666666667,
"grad_norm": 2.3340038254897566,
"kl": 0.06951904296875,
"learning_rate": 2.440574754551996e-07,
"loss": 0.0246,
"reward": 3.5,
"reward_std": 2.0238241851329803,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 648.0625,
"epoch": 68.88888888888889,
"grad_norm": 1.5884432300379054,
"kl": 0.04443359375,
"learning_rate": 2.4051521310939254e-07,
"loss": 0.1177,
"reward": 4.0,
"reward_std": 1.8069141209125519,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 546.625,
"epoch": 69.22222222222223,
"grad_norm": 2.985922020759926,
"kl": 0.10992431640625,
"learning_rate": 2.3699068620408301e-07,
"loss": 0.0152,
"reward": 3.15625,
"reward_std": 1.511039137840271,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 1.0,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 602.25,
"epoch": 69.44444444444444,
"grad_norm": 1.5923870878518707,
"kl": 0.056396484375,
"learning_rate": 2.3348413563600323e-07,
"loss": 0.0176,
"reward": 4.5,
"reward_std": 2.31710484623909,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 598.46875,
"epoch": 69.66666666666667,
"grad_norm": 1.5951806343259411,
"kl": 0.04925537109375,
"learning_rate": 2.2999580107322654e-07,
"loss": 0.0929,
"reward": 4.9375,
"reward_std": 2.494741439819336,
"rewards/accuracy_reward_staging": 0.59375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 520.6875,
"epoch": 69.88888888888889,
"grad_norm": 1.5129561001363085,
"kl": 0.0699462890625,
"learning_rate": 2.2652592093878665e-07,
"loss": 0.0125,
"reward": 4.25,
"reward_std": 1.878759890794754,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.9375,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 645.0,
"epoch": 70.22222222222223,
"grad_norm": 1.5182810808110982,
"kl": 0.0643310546875,
"learning_rate": 2.2307473239438152e-07,
"loss": 0.01,
"reward": 4.40625,
"reward_std": 2.5910332798957825,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 1.0,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 569.21875,
"epoch": 70.44444444444444,
"grad_norm": 1.8382342741040079,
"kl": 0.05499267578125,
"learning_rate": 2.1964247132416368e-07,
"loss": 0.0019,
"reward": 4.40625,
"reward_std": 3.0214737951755524,
"rewards/accuracy_reward_staging": 0.5625,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.90625,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 608.0,
"epoch": 70.66666666666667,
"grad_norm": 1.7202842060510593,
"kl": 0.04736328125,
"learning_rate": 2.1622937231861822e-07,
"loss": 0.0307,
"reward": 3.375,
"reward_std": 2.42453271150589,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.875,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 600.9375,
"epoch": 70.88888888888889,
"grad_norm": 1.4517912073118557,
"kl": 0.04290771484375,
"learning_rate": 2.128356686585282e-07,
"loss": 0.0476,
"reward": 3.75,
"reward_std": 1.7858919501304626,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 649.53125,
"epoch": 71.22222222222223,
"grad_norm": 1.259142583692514,
"kl": 0.0467529296875,
"learning_rate": 2.0946159229903088e-07,
"loss": 0.0839,
"reward": 2.84375,
"reward_std": 1.5846085250377655,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.875,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 625.875,
"epoch": 71.44444444444444,
"grad_norm": 1.340679975407133,
"kl": 0.0565185546875,
"learning_rate": 2.0610737385376348e-07,
"loss": 0.0085,
"reward": 3.59375,
"reward_std": 1.975972980260849,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.875,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 603.34375,
"epoch": 71.66666666666667,
"grad_norm": 1.4289962866267603,
"kl": 0.06005859375,
"learning_rate": 2.0277324257910106e-07,
"loss": 0.0185,
"reward": 5.5,
"reward_std": 2.4536279439926147,
"rewards/accuracy_reward_staging": 0.75,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.9375,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 558.25,
"epoch": 71.88888888888889,
"grad_norm": 1.6936868571901433,
"kl": 0.0546875,
"learning_rate": 1.9945942635848745e-07,
"loss": 0.0145,
"reward": 3.78125,
"reward_std": 2.1591877937316895,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 636.28125,
"epoch": 72.22222222222223,
"grad_norm": 1.3457751552584203,
"kl": 0.0472412109375,
"learning_rate": 1.9616615168685942e-07,
"loss": 0.0082,
"reward": 3.375,
"reward_std": 1.7216877937316895,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 565.1875,
"epoch": 72.44444444444444,
"grad_norm": 1.1708504647450504,
"kl": 0.0599365234375,
"learning_rate": 1.9289364365516607e-07,
"loss": 0.015,
"reward": 4.46875,
"reward_std": 1.2958193719387054,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 597.125,
"epoch": 72.66666666666667,
"grad_norm": 2.00516555564966,
"kl": 0.065185546875,
"learning_rate": 1.896421259349844e-07,
"loss": 0.0357,
"reward": 4.21875,
"reward_std": 2.589491307735443,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 672.75,
"epoch": 72.88888888888889,
"grad_norm": 2.8218244852832335,
"kl": 0.09649658203125,
"learning_rate": 1.8641182076323148e-07,
"loss": -0.0058,
"reward": 5.03125,
"reward_std": 3.2576534748077393,
"rewards/accuracy_reward_staging": 0.625,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 577.65625,
"epoch": 73.22222222222223,
"grad_norm": 1.7581055322994823,
"kl": 0.06195068359375,
"learning_rate": 1.8320294892697475e-07,
"loss": 0.0534,
"reward": 3.0,
"reward_std": 2.1200742721557617,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.84375,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 562.6875,
"epoch": 73.44444444444444,
"grad_norm": 1.5050675135024016,
"kl": 0.0499267578125,
"learning_rate": 1.8001572974834168e-07,
"loss": 0.0343,
"reward": 4.0,
"reward_std": 1.9108919501304626,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 590.5,
"epoch": 73.66666666666667,
"grad_norm": 12.895158631725321,
"kl": 0.12432861328125,
"learning_rate": 1.768503810695295e-07,
"loss": 0.0513,
"reward": 3.46875,
"reward_std": 1.6672459840774536,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 1.0,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 609.125,
"epoch": 73.88888888888889,
"grad_norm": 1.7151482870021748,
"kl": 0.07269287109375,
"learning_rate": 1.7370711923791564e-07,
"loss": -0.0106,
"reward": 5.625,
"reward_std": 2.8527393341064453,
"rewards/accuracy_reward_staging": 0.78125,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.875,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 675.34375,
"epoch": 74.22222222222223,
"grad_norm": 1.6055158333490096,
"kl": 0.0538330078125,
"learning_rate": 1.70586159091271e-07,
"loss": 0.0916,
"reward": 3.53125,
"reward_std": 2.737855911254883,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.8125,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 638.0,
"epoch": 74.44444444444444,
"grad_norm": 1.3927154732757459,
"kl": 0.0494384765625,
"learning_rate": 1.674877139430758e-07,
"loss": -0.0039,
"reward": 3.5,
"reward_std": 2.132579743862152,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 597.78125,
"epoch": 74.66666666666667,
"grad_norm": 1.2941317675033293,
"kl": 0.05804443359375,
"learning_rate": 1.6441199556794034e-07,
"loss": 0.0582,
"reward": 3.28125,
"reward_std": 2.0324151515960693,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 299
},
{
"epoch": 74.88888888888889,
"grad_norm": 1.1895166775660873,
"learning_rate": 1.6135921418712955e-07,
"loss": 0.0154,
"step": 300
},
{
"epoch": 74.88888888888889,
"eval_clip_ratio": 0.0,
"eval_completion_length": 558.5,
"eval_kl": 0.055908203125,
"eval_loss": 0.04830198734998703,
"eval_reward": 3.25,
"eval_reward_std": 2.227747082710266,
"eval_rewards/accuracy_reward_staging": 0.275,
"eval_rewards/format_reward": 0.9,
"eval_rewards/format_reward_staging": 0.975,
"eval_runtime": 50.8525,
"eval_samples_per_second": 0.708,
"eval_steps_per_second": 0.098,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 522.46875,
"epoch": 75.22222222222223,
"grad_norm": 1.2719829976418617,
"kl": 0.05950927734375,
"learning_rate": 1.5832957845419582e-07,
"loss": -0.0239,
"reward": 4.078125,
"reward_std": 1.734619602560997,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.96875,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 531.75,
"epoch": 75.44444444444444,
"grad_norm": 1.4701942608061176,
"kl": 0.05584716796875,
"learning_rate": 1.553232954407171e-07,
"loss": -0.0222,
"reward": 4.46875,
"reward_std": 1.8445461988449097,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 606.6875,
"epoch": 75.66666666666667,
"grad_norm": 0.979841248868734,
"kl": 0.0506591796875,
"learning_rate": 1.52340570622144e-07,
"loss": 0.0094,
"reward": 4.34375,
"reward_std": 1.0341877937316895,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 645.0,
"epoch": 75.88888888888889,
"grad_norm": 1.2907279139619887,
"kl": 0.05084228515625,
"learning_rate": 1.493816078637557e-07,
"loss": 0.0349,
"reward": 4.03125,
"reward_std": 2.768365204334259,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 612.03125,
"epoch": 76.22222222222223,
"grad_norm": 1.3052082852261886,
"kl": 0.06219482421875,
"learning_rate": 1.4644660940672627e-07,
"loss": 0.0241,
"reward": 3.90625,
"reward_std": 1.5625,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 1.0,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 552.0,
"epoch": 76.44444444444444,
"grad_norm": 1.6816507380806482,
"kl": 0.0640869140625,
"learning_rate": 1.435357758543015e-07,
"loss": 0.0623,
"reward": 3.5,
"reward_std": 2.3343209326267242,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.90625,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 605.96875,
"epoch": 76.66666666666667,
"grad_norm": 1.7963549332670843,
"kl": 0.05462646484375,
"learning_rate": 1.4064930615808806e-07,
"loss": -0.0141,
"reward": 3.90625,
"reward_std": 3.359531879425049,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 559.5625,
"epoch": 76.88888888888889,
"grad_norm": 1.3270350222684457,
"kl": 0.0548095703125,
"learning_rate": 1.3778739760445552e-07,
"loss": 0.0232,
"reward": 3.53125,
"reward_std": 2.031329423189163,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 574.9375,
"epoch": 77.22222222222223,
"grad_norm": 1.403581677625956,
"kl": 0.0579833984375,
"learning_rate": 1.349502458010519e-07,
"loss": 0.0045,
"reward": 3.40625,
"reward_std": 1.5280899405479431,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 571.4375,
"epoch": 77.44444444444444,
"grad_norm": 1.4518085181139868,
"kl": 0.05694580078125,
"learning_rate": 1.321380446634342e-07,
"loss": -0.0332,
"reward": 4.53125,
"reward_std": 2.796904981136322,
"rewards/accuracy_reward_staging": 0.5625,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.90625,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 592.375,
"epoch": 77.66666666666667,
"grad_norm": 1.4193852483613092,
"kl": 0.04937744140625,
"learning_rate": 1.2935098640181457e-07,
"loss": 0.0097,
"reward": 3.71875,
"reward_std": 1.6591877937316895,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 576.5625,
"epoch": 77.88888888888889,
"grad_norm": 1.5563017115814217,
"kl": 0.055419921875,
"learning_rate": 1.2658926150792322e-07,
"loss": 0.0595,
"reward": 4.03125,
"reward_std": 2.8135814666748047,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 618.15625,
"epoch": 78.22222222222223,
"grad_norm": 1.7591845192775064,
"kl": 0.05523681640625,
"learning_rate": 1.2385305874198775e-07,
"loss": -0.0554,
"reward": 2.625,
"reward_std": 1.8215623199939728,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.90625,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 554.40625,
"epoch": 78.44444444444444,
"grad_norm": 1.532211221198721,
"kl": 0.04986572265625,
"learning_rate": 1.2114256511983274e-07,
"loss": 0.0323,
"reward": 4.40625,
"reward_std": 2.975598633289337,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 563.71875,
"epoch": 78.66666666666667,
"grad_norm": 1.827273461906554,
"kl": 0.054931640625,
"learning_rate": 1.1845796590009683e-07,
"loss": 0.1089,
"reward": 4.28125,
"reward_std": 2.9560980796813965,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.90625,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 607.125,
"epoch": 78.88888888888889,
"grad_norm": 1.5705329008308428,
"kl": 0.05218505859375,
"learning_rate": 1.1579944457157059e-07,
"loss": 0.0714,
"reward": 3.53125,
"reward_std": 2.3595376014709473,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.84375,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 576.59375,
"epoch": 79.22222222222223,
"grad_norm": 1.2784333227971698,
"kl": 0.04998779296875,
"learning_rate": 1.1316718284065535e-07,
"loss": -0.0327,
"reward": 3.1875,
"reward_std": 1.75,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 598.6875,
"epoch": 79.44444444444444,
"grad_norm": 1.2996658268690655,
"kl": 0.05255126953125,
"learning_rate": 1.1056136061894384e-07,
"loss": -0.0387,
"reward": 4.5,
"reward_std": 1.6467358469963074,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 576.25,
"epoch": 79.66666666666667,
"grad_norm": 1.5574614421463486,
"kl": 0.04937744140625,
"learning_rate": 1.0798215601092353e-07,
"loss": 0.0303,
"reward": 4.375,
"reward_std": 2.325068473815918,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 626.0625,
"epoch": 79.88888888888889,
"grad_norm": 1.5754896085780978,
"kl": 0.0546875,
"learning_rate": 1.0542974530180327e-07,
"loss": 0.0137,
"reward": 4.1875,
"reward_std": 2.3722406029701233,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 550.71875,
"epoch": 80.22222222222223,
"grad_norm": 1.610726394417215,
"kl": 0.06024169921875,
"learning_rate": 1.0290430294546448e-07,
"loss": 0.013,
"reward": 3.90625,
"reward_std": 2.4335986375808716,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 655.25,
"epoch": 80.44444444444444,
"grad_norm": 1.4911116339078845,
"kl": 0.0523681640625,
"learning_rate": 1.0040600155253764e-07,
"loss": 0.0332,
"reward": 2.78125,
"reward_std": 1.3004322350025177,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 519.3125,
"epoch": 80.66666666666667,
"grad_norm": 1.6907992973095978,
"kl": 0.0538330078125,
"learning_rate": 9.793501187860431e-08,
"loss": -0.0401,
"reward": 4.0,
"reward_std": 2.362515449523926,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 538.5,
"epoch": 80.88888888888889,
"grad_norm": 1.646427054544714,
"kl": 0.063232421875,
"learning_rate": 9.549150281252632e-08,
"loss": -0.0039,
"reward": 4.15625,
"reward_std": 2.2053900957107544,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 562.03125,
"epoch": 81.22222222222223,
"grad_norm": 1.2401904323679076,
"kl": 0.059814453125,
"learning_rate": 9.307564136490254e-08,
"loss": 0.0337,
"reward": 2.6875,
"reward_std": 1.4073790609836578,
"rewards/accuracy_reward_staging": 0.15625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 1.0,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 593.3125,
"epoch": 81.44444444444444,
"grad_norm": 1.5857599353606664,
"kl": 0.0521240234375,
"learning_rate": 9.068759265665382e-08,
"loss": 0.0031,
"reward": 3.46875,
"reward_std": 2.041439712047577,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 636.15625,
"epoch": 81.66666666666667,
"grad_norm": 1.464039724178544,
"kl": 0.04815673828125,
"learning_rate": 8.832751990773712e-08,
"loss": -0.033,
"reward": 4.0625,
"reward_std": 2.3850997388362885,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 550.625,
"epoch": 81.88888888888889,
"grad_norm": 1.546052526056493,
"kl": 0.05743408203125,
"learning_rate": 8.599558442598998e-08,
"loss": 0.0427,
"reward": 4.15625,
"reward_std": 2.8091025352478027,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.875,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 592.53125,
"epoch": 82.22222222222223,
"grad_norm": 1.359491111417973,
"kl": 0.05621337890625,
"learning_rate": 8.369194559610481e-08,
"loss": 0.0752,
"reward": 3.03125,
"reward_std": 1.4954701960086823,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.90625,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 608.78125,
"epoch": 82.44444444444444,
"grad_norm": 1.4795564527051304,
"kl": 0.05267333984375,
"learning_rate": 8.141676086873573e-08,
"loss": 0.0759,
"reward": 3.28125,
"reward_std": 1.9649099707603455,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 583.0625,
"epoch": 82.66666666666667,
"grad_norm": 1.6241400838987166,
"kl": 0.0562744140625,
"learning_rate": 7.917018574973644e-08,
"loss": 0.0196,
"reward": 4.40625,
"reward_std": 2.2960872054100037,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.96875,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 553.5,
"epoch": 82.88888888888889,
"grad_norm": 1.5339258798115873,
"kl": 0.0474853515625,
"learning_rate": 7.695237378953224e-08,
"loss": -0.0209,
"reward": 4.5,
"reward_std": 2.332531690597534,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 588.34375,
"epoch": 83.22222222222223,
"grad_norm": 1.5601324006274968,
"kl": 0.05804443359375,
"learning_rate": 7.476347657262455e-08,
"loss": -0.039,
"reward": 4.71875,
"reward_std": 2.439529001712799,
"rewards/accuracy_reward_staging": 0.5625,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 558.25,
"epoch": 83.44444444444444,
"grad_norm": 1.5625484764568953,
"kl": 0.05841064453125,
"learning_rate": 7.260364370723043e-08,
"loss": -0.0022,
"reward": 3.875,
"reward_std": 2.6049662828445435,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 538.53125,
"epoch": 83.66666666666667,
"grad_norm": 1.6578933804792892,
"kl": 0.064697265625,
"learning_rate": 7.047302281505735e-08,
"loss": 0.0178,
"reward": 3.6875,
"reward_std": 1.93262779712677,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 579.3125,
"epoch": 83.88888888888889,
"grad_norm": 1.737744709396854,
"kl": 0.05303955078125,
"learning_rate": 6.837175952121304e-08,
"loss": -0.056,
"reward": 3.875,
"reward_std": 2.5176164507865906,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 559.25,
"epoch": 84.22222222222223,
"grad_norm": 1.5777065212148367,
"kl": 0.0582275390625,
"learning_rate": 6.629999744425235e-08,
"loss": -0.0542,
"reward": 3.3125,
"reward_std": 1.8360159397125244,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.90625,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 604.03125,
"epoch": 84.44444444444444,
"grad_norm": 1.7153117337295096,
"kl": 0.05419921875,
"learning_rate": 6.42578781863613e-08,
"loss": 0.0782,
"reward": 3.625,
"reward_std": 3.0492074489593506,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.90625,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 591.3125,
"epoch": 84.66666666666667,
"grad_norm": 1.3347688495066687,
"kl": 0.053466796875,
"learning_rate": 6.22455413236786e-08,
"loss": -0.0014,
"reward": 3.03125,
"reward_std": 1.389709249138832,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.90625,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 633.6875,
"epoch": 84.88888888888889,
"grad_norm": 1.389754239090955,
"kl": 0.04827880859375,
"learning_rate": 6.026312439675551e-08,
"loss": 0.0256,
"reward": 4.25,
"reward_std": 2.171033263206482,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 550.59375,
"epoch": 85.22222222222223,
"grad_norm": 1.7000896266286212,
"kl": 0.06982421875,
"learning_rate": 5.831076290115572e-08,
"loss": 0.0243,
"reward": 4.15625,
"reward_std": 2.343973159790039,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.96875,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 600.8125,
"epoch": 85.44444444444444,
"grad_norm": 1.4833240972495387,
"kl": 0.056884765625,
"learning_rate": 5.638859027819409e-08,
"loss": 0.0553,
"reward": 3.53125,
"reward_std": 2.29950013756752,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 650.875,
"epoch": 85.66666666666667,
"grad_norm": 1.3618290470634058,
"kl": 0.04998779296875,
"learning_rate": 5.44967379058161e-08,
"loss": -0.0017,
"reward": 5.0,
"reward_std": 2.3323360979557037,
"rewards/accuracy_reward_staging": 0.625,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 576.75,
"epoch": 85.88888888888889,
"grad_norm": 1.749186308713559,
"kl": 0.05303955078125,
"learning_rate": 5.263533508961826e-08,
"loss": 0.0794,
"reward": 3.34375,
"reward_std": 2.082039564847946,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.9375,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 573.3125,
"epoch": 86.22222222222223,
"grad_norm": 1.4322968854479468,
"kl": 0.05615234375,
"learning_rate": 5.080450905401057e-08,
"loss": 0.0153,
"reward": 4.25,
"reward_std": 1.8755539804697037,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 546.59375,
"epoch": 86.44444444444444,
"grad_norm": 1.2682803454826486,
"kl": 0.053955078125,
"learning_rate": 4.9004384933520547e-08,
"loss": 0.0083,
"reward": 3.53125,
"reward_std": 1.3726893961429596,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 580.21875,
"epoch": 86.66666666666667,
"grad_norm": 1.5693179766123742,
"kl": 0.05389404296875,
"learning_rate": 4.723508576424062e-08,
"loss": -0.0063,
"reward": 3.46875,
"reward_std": 2.777799040079117,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.90625,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 595.375,
"epoch": 86.88888888888889,
"grad_norm": 1.6314212035379032,
"kl": 0.053955078125,
"learning_rate": 4.549673247541874e-08,
"loss": -0.01,
"reward": 4.15625,
"reward_std": 2.311874210834503,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 568.71875,
"epoch": 87.22222222222223,
"grad_norm": 1.658360075079596,
"kl": 0.0572509765625,
"learning_rate": 4.37894438811931e-08,
"loss": 0.0064,
"reward": 3.5625,
"reward_std": 2.875,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 1.0,
"step": 349
},
{
"epoch": 87.44444444444444,
"grad_norm": 1.4586421894209247,
"learning_rate": 4.2113336672471245e-08,
"loss": 0.0579,
"step": 350
},
{
"epoch": 87.44444444444444,
"eval_clip_ratio": 0.0,
"eval_completion_length": 609.05,
"eval_kl": 0.052783203125,
"eval_loss": 0.033656854182481766,
"eval_reward": 2.45,
"eval_reward_std": 1.6229771614074706,
"eval_rewards/accuracy_reward_staging": 0.15,
"eval_rewards/format_reward": 0.825,
"eval_rewards/format_reward_staging": 0.875,
"eval_runtime": 55.2193,
"eval_samples_per_second": 0.652,
"eval_steps_per_second": 0.091,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 609.578125,
"epoch": 87.66666666666667,
"grad_norm": 1.6263684105864271,
"kl": 0.0521240234375,
"learning_rate": 4.0468525408954456e-08,
"loss": 0.0832,
"reward": 4.265625,
"reward_std": 2.632143199443817,
"rewards/accuracy_reward_staging": 0.484375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 630.0,
"epoch": 87.88888888888889,
"grad_norm": 1.406310532139818,
"kl": 0.0509033203125,
"learning_rate": 3.8855122511307626e-08,
"loss": 0.0517,
"reward": 3.0625,
"reward_std": 1.3608438968658447,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 592.46875,
"epoch": 88.22222222222223,
"grad_norm": 1.505022624138408,
"kl": 0.05255126953125,
"learning_rate": 3.727323825347578e-08,
"loss": 0.0469,
"reward": 4.25,
"reward_std": 2.023455113172531,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 570.21875,
"epoch": 88.44444444444444,
"grad_norm": 1.7398525080011868,
"kl": 0.051025390625,
"learning_rate": 3.572298075514652e-08,
"loss": 0.0079,
"reward": 5.25,
"reward_std": 2.496154248714447,
"rewards/accuracy_reward_staging": 0.65625,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 0.96875,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 609.75,
"epoch": 88.66666666666667,
"grad_norm": 1.4985931737207225,
"kl": 0.05255126953125,
"learning_rate": 3.420445597436056e-08,
"loss": 0.0262,
"reward": 4.09375,
"reward_std": 2.1352776885032654,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 673.75,
"epoch": 88.88888888888889,
"grad_norm": 1.4906376909879282,
"kl": 0.05889892578125,
"learning_rate": 3.271776770026963e-08,
"loss": 0.0716,
"reward": 3.28125,
"reward_std": 2.086387515068054,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.9375,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 549.53125,
"epoch": 89.22222222222223,
"grad_norm": 1.87310658244872,
"kl": 0.08197021484375,
"learning_rate": 3.1263017546042326e-08,
"loss": 0.0395,
"reward": 3.90625,
"reward_std": 2.444858193397522,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 1.0,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 570.34375,
"epoch": 89.44444444444444,
"grad_norm": 1.6047992153607105,
"kl": 0.05328369140625,
"learning_rate": 2.9840304941919416e-08,
"loss": 0.0128,
"reward": 4.09375,
"reward_std": 3.0098507404327393,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 588.09375,
"epoch": 89.66666666666667,
"grad_norm": 1.3977481210272724,
"kl": 0.0628662109375,
"learning_rate": 2.8449727128417367e-08,
"loss": 0.0184,
"reward": 3.6875,
"reward_std": 1.197430670261383,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.875,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 663.0625,
"epoch": 89.88888888888889,
"grad_norm": 1.521856541607207,
"kl": 0.04925537109375,
"learning_rate": 2.7091379149682682e-08,
"loss": -0.0485,
"reward": 4.34375,
"reward_std": 2.930923640727997,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 585.0625,
"epoch": 90.22222222222223,
"grad_norm": 1.8447806012116532,
"kl": 0.05377197265625,
"learning_rate": 2.5765353846995297e-08,
"loss": 0.049,
"reward": 4.53125,
"reward_std": 3.1164740920066833,
"rewards/accuracy_reward_staging": 0.5625,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.90625,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 650.21875,
"epoch": 90.44444444444444,
"grad_norm": 1.184344841195356,
"kl": 0.04827880859375,
"learning_rate": 2.4471741852423233e-08,
"loss": 0.0432,
"reward": 2.96875,
"reward_std": 1.5483438968658447,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 547.875,
"epoch": 90.66666666666667,
"grad_norm": 1.4816241418571312,
"kl": 0.065185546875,
"learning_rate": 2.3210631582627927e-08,
"loss": -0.007,
"reward": 4.53125,
"reward_std": 2.563981920480728,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 646.875,
"epoch": 90.88888888888889,
"grad_norm": 1.4324587884354845,
"kl": 0.0557861328125,
"learning_rate": 2.1982109232821176e-08,
"loss": 0.0456,
"reward": 4.6875,
"reward_std": 2.595020294189453,
"rewards/accuracy_reward_staging": 0.5625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 609.875,
"epoch": 91.22222222222223,
"grad_norm": 1.406708270283001,
"kl": 0.04571533203125,
"learning_rate": 2.0786258770873645e-08,
"loss": -0.0323,
"reward": 5.28125,
"reward_std": 2.358702301979065,
"rewards/accuracy_reward_staging": 0.65625,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 625.96875,
"epoch": 91.44444444444444,
"grad_norm": 1.254136427080868,
"kl": 0.0477294921875,
"learning_rate": 1.9623161931575926e-08,
"loss": 0.0391,
"reward": 4.25,
"reward_std": 1.3912444412708282,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 584.8125,
"epoch": 91.66666666666667,
"grad_norm": 1.4411232733747643,
"kl": 0.057861328125,
"learning_rate": 1.849289821105199e-08,
"loss": 0.0171,
"reward": 3.125,
"reward_std": 1.5756275057792664,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 629.1875,
"epoch": 91.88888888888889,
"grad_norm": 1.1371389339839404,
"kl": 0.051513671875,
"learning_rate": 1.7395544861325718e-08,
"loss": 0.011,
"reward": 3.53125,
"reward_std": 1.816932737827301,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 615.21875,
"epoch": 92.22222222222223,
"grad_norm": 1.2832104145352503,
"kl": 0.046142578125,
"learning_rate": 1.6331176885040876e-08,
"loss": 0.0567,
"reward": 3.78125,
"reward_std": 1.9511407613754272,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 561.375,
"epoch": 92.44444444444444,
"grad_norm": 1.4397679570391773,
"kl": 0.05340576171875,
"learning_rate": 1.5299867030334813e-08,
"loss": 0.0089,
"reward": 3.1875,
"reward_std": 1.2975594997406006,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 547.0625,
"epoch": 92.66666666666667,
"grad_norm": 1.5669540097130739,
"kl": 0.066650390625,
"learning_rate": 1.4301685785866213e-08,
"loss": -0.0198,
"reward": 4.46875,
"reward_std": 2.9167675375938416,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 588.1875,
"epoch": 92.88888888888889,
"grad_norm": 1.6359800713030668,
"kl": 0.05194091796875,
"learning_rate": 1.3336701375997127e-08,
"loss": 0.0226,
"reward": 4.1875,
"reward_std": 2.957531690597534,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 596.375,
"epoch": 93.22222222222223,
"grad_norm": 22.007037588121534,
"kl": 0.2752685546875,
"learning_rate": 1.240497975613014e-08,
"loss": -0.0325,
"reward": 3.75,
"reward_std": 1.8229495882987976,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.90625,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 554.75,
"epoch": 93.44444444444444,
"grad_norm": 1.5726784994170007,
"kl": 0.05316162109375,
"learning_rate": 1.1506584608200364e-08,
"loss": 0.0904,
"reward": 2.75,
"reward_std": 1.680722177028656,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 597.53125,
"epoch": 93.66666666666667,
"grad_norm": 1.2719220301062524,
"kl": 0.05828857421875,
"learning_rate": 1.0641577336322761e-08,
"loss": 0.0199,
"reward": 4.96875,
"reward_std": 2.2166852056980133,
"rewards/accuracy_reward_staging": 0.625,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 640.1875,
"epoch": 93.88888888888889,
"grad_norm": 2.653449397011027,
"kl": 0.07366943359375,
"learning_rate": 9.810017062595321e-09,
"loss": 0.0336,
"reward": 4.0,
"reward_std": 2.329674154520035,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 511.21875,
"epoch": 94.22222222222223,
"grad_norm": 1.4821827826071337,
"kl": 0.04779052734375,
"learning_rate": 9.011960623058201e-09,
"loss": -0.0241,
"reward": 4.53125,
"reward_std": 1.9632892608642578,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 516.34375,
"epoch": 94.44444444444444,
"grad_norm": 1.424328758939937,
"kl": 0.055419921875,
"learning_rate": 8.247462563808816e-09,
"loss": 0.018,
"reward": 4.46875,
"reward_std": 2.4695461988449097,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 630.75,
"epoch": 94.66666666666667,
"grad_norm": 1.3627094949939382,
"kl": 0.05291748046875,
"learning_rate": 7.516575137274162e-09,
"loss": 0.05,
"reward": 3.9375,
"reward_std": 2.0698782801628113,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 632.625,
"epoch": 94.88888888888889,
"grad_norm": 1.1699961416103346,
"kl": 0.05120849609375,
"learning_rate": 6.819348298638839e-09,
"loss": 0.0182,
"reward": 3.1875,
"reward_std": 2.152123808860779,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 618.8125,
"epoch": 95.22222222222223,
"grad_norm": 1.362463533881549,
"kl": 0.06512451171875,
"learning_rate": 6.15582970243117e-09,
"loss": 0.0677,
"reward": 3.375,
"reward_std": 1.7910222113132477,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 551.15625,
"epoch": 95.44444444444444,
"grad_norm": 1.297999563247604,
"kl": 0.06048583984375,
"learning_rate": 5.526064699265753e-09,
"loss": 0.0032,
"reward": 3.96875,
"reward_std": 1.8312554359436035,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 604.625,
"epoch": 95.66666666666667,
"grad_norm": 1.486555198053525,
"kl": 0.05755615234375,
"learning_rate": 4.9300963327441044e-09,
"loss": 0.043,
"reward": 4.40625,
"reward_std": 3.2432121634483337,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 583.875,
"epoch": 95.88888888888889,
"grad_norm": 1.5255994079961044,
"kl": 0.05120849609375,
"learning_rate": 4.367965336512403e-09,
"loss": -0.0079,
"reward": 3.8125,
"reward_std": 1.9035333096981049,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 500.53125,
"epoch": 96.22222222222223,
"grad_norm": 1.3113670847804533,
"kl": 0.05438232421875,
"learning_rate": 3.8397101314774915e-09,
"loss": -0.0184,
"reward": 3.28125,
"reward_std": 1.0818375647068024,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 554.8125,
"epoch": 96.44444444444444,
"grad_norm": 1.7546175198232294,
"kl": 0.056640625,
"learning_rate": 3.3453668231809283e-09,
"loss": -0.0321,
"reward": 5.5,
"reward_std": 3.5806562304496765,
"rewards/accuracy_reward_staging": 0.71875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 632.9375,
"epoch": 96.66666666666667,
"grad_norm": 1.3830980489663667,
"kl": 0.050537109375,
"learning_rate": 2.8849691993311777e-09,
"loss": 0.0483,
"reward": 3.59375,
"reward_std": 2.2675071954727173,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 565.0,
"epoch": 96.88888888888889,
"grad_norm": 1.6187360157512092,
"kl": 0.0645751953125,
"learning_rate": 2.458548727494292e-09,
"loss": 0.0672,
"reward": 4.1875,
"reward_std": 2.5254639387130737,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.90625,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 601.3125,
"epoch": 97.22222222222223,
"grad_norm": 1.513261818035226,
"kl": 0.05316162109375,
"learning_rate": 2.066134552943077e-09,
"loss": -0.054,
"reward": 4.0,
"reward_std": 2.443375587463379,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 618.53125,
"epoch": 97.44444444444444,
"grad_norm": 1.2736528121828654,
"kl": 0.04791259765625,
"learning_rate": 1.7077534966650765e-09,
"loss": 0.0219,
"reward": 5.375,
"reward_std": 2.514360010623932,
"rewards/accuracy_reward_staging": 0.6875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 593.1875,
"epoch": 97.66666666666667,
"grad_norm": 1.5869628592439236,
"kl": 0.0782470703125,
"learning_rate": 1.383430053529422e-09,
"loss": -0.01,
"reward": 3.40625,
"reward_std": 1.3685379922389984,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 646.5,
"epoch": 97.88888888888889,
"grad_norm": 1.3206401873227125,
"kl": 0.052490234375,
"learning_rate": 1.0931863906127325e-09,
"loss": -0.0253,
"reward": 3.65625,
"reward_std": 1.6694981455802917,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 573.78125,
"epoch": 98.22222222222223,
"grad_norm": 1.8394528163274233,
"kl": 0.0552978515625,
"learning_rate": 8.370423456837139e-10,
"loss": 0.0136,
"reward": 4.625,
"reward_std": 2.260310411453247,
"rewards/accuracy_reward_staging": 0.5625,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.96875,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 621.53125,
"epoch": 98.44444444444444,
"grad_norm": 1.5558051348782826,
"kl": 0.06475830078125,
"learning_rate": 6.150154258476314e-10,
"loss": -0.0687,
"reward": 4.71875,
"reward_std": 2.5687596797943115,
"rewards/accuracy_reward_staging": 0.5625,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.9375,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 595.75,
"epoch": 98.66666666666667,
"grad_norm": 1.1453949481051802,
"kl": 0.048095703125,
"learning_rate": 4.271208063494902e-10,
"loss": -0.0004,
"reward": 2.5625,
"reward_std": 1.1108438968658447,
"rewards/accuracy_reward_staging": 0.125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 612.9375,
"epoch": 98.88888888888889,
"grad_norm": 1.627509813072313,
"kl": 0.05230712890625,
"learning_rate": 2.733713295369755e-10,
"loss": -0.0208,
"reward": 4.375,
"reward_std": 2.401917338371277,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 598.03125,
"epoch": 99.22222222222223,
"grad_norm": 1.3542702485509543,
"kl": 0.0565185546875,
"learning_rate": 1.53777503982655e-10,
"loss": 0.0102,
"reward": 3.3125,
"reward_std": 2.002065122127533,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 1.0,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 589.84375,
"epoch": 99.44444444444444,
"grad_norm": 1.524546929028841,
"kl": 0.06207275390625,
"learning_rate": 6.834750376549791e-11,
"loss": 0.0366,
"reward": 4.625,
"reward_std": 2.895161896944046,
"rewards/accuracy_reward_staging": 0.5625,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 563.6875,
"epoch": 99.66666666666667,
"grad_norm": 1.4232250672089828,
"kl": 0.05712890625,
"learning_rate": 1.7087167912710476e-11,
"loss": 0.0203,
"reward": 3.875,
"reward_std": 1.875,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 399
},
{
"epoch": 99.88888888888889,
"grad_norm": 1.8247807880781484,
"learning_rate": 0.0,
"loss": 0.0974,
"step": 400
},
{
"epoch": 99.88888888888889,
"eval_clip_ratio": 0.0,
"eval_completion_length": 644.175,
"eval_kl": 0.052978515625,
"eval_loss": 0.008646870031952858,
"eval_reward": 2.3,
"eval_reward_std": 1.2995877504348754,
"eval_rewards/accuracy_reward_staging": 0.125,
"eval_rewards/format_reward": 0.8,
"eval_rewards/format_reward_staging": 0.875,
"eval_runtime": 54.9436,
"eval_samples_per_second": 0.655,
"eval_steps_per_second": 0.091,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 498.375,
"epoch": 99.88888888888889,
"kl": 0.0625,
"reward": 3.28125,
"reward_std": 2.418270230293274,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 400,
"total_flos": 0.0,
"train_loss": 0.01939338302021497,
"train_runtime": 14839.7642,
"train_samples_per_second": 0.243,
"train_steps_per_second": 0.027
}
],
"logging_steps": 1,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}