eval_deficiency_ckpt1000_0827 / trainer_state.json
TobyYang7's picture
Upload folder using huggingface_hub
a23d0b4 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0833333333333335,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 232.375,
"epoch": 0.0020833333333333333,
"grad_norm": 12.936853408813477,
"kl": 0.001018524169921875,
"learning_rate": 9.993055555555556e-07,
"loss": 0.0,
"reward": 0.21875,
"reward_std": 0.3930980935692787,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.15625,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 301.8125,
"epoch": 0.004166666666666667,
"grad_norm": 15.255448341369629,
"kl": 0.004360198974609375,
"learning_rate": 9.98611111111111e-07,
"loss": 0.0,
"reward": 0.34375,
"reward_std": 0.43536408245563507,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.3125,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 377.75,
"epoch": 0.00625,
"grad_norm": 16.691909790039062,
"kl": 0.0025482177734375,
"learning_rate": 9.979166666666667e-07,
"loss": 0.0,
"reward": 0.6875,
"reward_std": 0.5512787848711014,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.625,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 324.28125,
"epoch": 0.008333333333333333,
"grad_norm": 21.816268920898438,
"kl": 0.08805465698242188,
"learning_rate": 9.972222222222222e-07,
"loss": 0.0001,
"reward": 0.53125,
"reward_std": 0.564938560128212,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.5,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 414.09375,
"epoch": 0.010416666666666666,
"grad_norm": 10.746119499206543,
"kl": 0.003200531005859375,
"learning_rate": 9.965277777777778e-07,
"loss": 0.0,
"reward": 0.5625,
"reward_std": 0.49022960662841797,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.5625,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 443.3125,
"epoch": 0.0125,
"grad_norm": 3.8227508068084717,
"kl": 0.0056610107421875,
"learning_rate": 9.958333333333333e-07,
"loss": 0.0,
"reward": 0.875,
"reward_std": 0.49796397238969803,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.8125,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 457.0625,
"epoch": 0.014583333333333334,
"grad_norm": 3.433580160140991,
"kl": 0.00739288330078125,
"learning_rate": 9.95138888888889e-07,
"loss": 0.0,
"reward": 0.6875,
"reward_std": 0.49721167981624603,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.625,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 445.5625,
"epoch": 0.016666666666666666,
"grad_norm": 3.7514586448669434,
"kl": 0.00689697265625,
"learning_rate": 9.944444444444444e-07,
"loss": 0.0,
"reward": 0.71875,
"reward_std": 0.4397946000099182,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.6875,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 446.125,
"epoch": 0.01875,
"grad_norm": 1.7038488388061523,
"kl": 0.00711822509765625,
"learning_rate": 9.9375e-07,
"loss": 0.0,
"reward": 0.78125,
"reward_std": 0.3061639815568924,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.75,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 436.21875,
"epoch": 0.020833333333333332,
"grad_norm": 2.8775792121887207,
"kl": 0.00970458984375,
"learning_rate": 9.930555555555555e-07,
"loss": 0.0,
"reward": 0.875,
"reward_std": 0.4671337679028511,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.78125,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 445.90625,
"epoch": 0.022916666666666665,
"grad_norm": 2.9945576190948486,
"kl": 0.00738525390625,
"learning_rate": 9.923611111111111e-07,
"loss": 0.0,
"reward": 0.9375,
"reward_std": 0.5281829461455345,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.8125,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 451.375,
"epoch": 0.025,
"grad_norm": 2.374281883239746,
"kl": 0.0085296630859375,
"learning_rate": 9.916666666666666e-07,
"loss": 0.0,
"reward": 1.09375,
"reward_std": 0.2630179077386856,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.9375,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 402.34375,
"epoch": 0.027083333333333334,
"grad_norm": 2.477187395095825,
"kl": 0.0110626220703125,
"learning_rate": 9.909722222222222e-07,
"loss": 0.0,
"reward": 1.03125,
"reward_std": 0.48461921513080597,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.8125,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 417.0625,
"epoch": 0.029166666666666667,
"grad_norm": 8.733186721801758,
"kl": 0.0125274658203125,
"learning_rate": 9.902777777777779e-07,
"loss": 0.0,
"reward": 0.875,
"reward_std": 0.408231720328331,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.78125,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 439.78125,
"epoch": 0.03125,
"grad_norm": 3.690876007080078,
"kl": 0.0147705078125,
"learning_rate": 9.895833333333333e-07,
"loss": 0.0,
"reward": 1.03125,
"reward_std": 0.5986681878566742,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.71875,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 478.28125,
"epoch": 0.03333333333333333,
"grad_norm": 3.3308470249176025,
"kl": 0.01483154296875,
"learning_rate": 9.88888888888889e-07,
"loss": 0.0,
"reward": 0.75,
"reward_std": 0.3535533845424652,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.71875,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 411.1875,
"epoch": 0.035416666666666666,
"grad_norm": 4.208990573883057,
"kl": 0.016632080078125,
"learning_rate": 9.881944444444444e-07,
"loss": 0.0,
"reward": 0.46875,
"reward_std": 0.3377464786171913,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.40625,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 488.0625,
"epoch": 0.0375,
"grad_norm": 3.331338405609131,
"kl": 0.01483154296875,
"learning_rate": 9.875e-07,
"loss": 0.0,
"reward": 1.21875,
"reward_std": 0.5347195863723755,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.8125,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 405.0,
"epoch": 0.03958333333333333,
"grad_norm": 1.6825522184371948,
"kl": 0.017303466796875,
"learning_rate": 9.868055555555555e-07,
"loss": 0.0,
"reward": 0.625,
"reward_std": 0.2314550280570984,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.5625,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 398.375,
"epoch": 0.041666666666666664,
"grad_norm": 4.040245532989502,
"kl": 0.0188140869140625,
"learning_rate": 9.861111111111112e-07,
"loss": 0.0,
"reward": 1.0,
"reward_std": 0.44403792917728424,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.84375,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 442.28125,
"epoch": 0.04375,
"grad_norm": 3.547840118408203,
"kl": 0.0157470703125,
"learning_rate": 9.854166666666666e-07,
"loss": 0.0,
"reward": 1.40625,
"reward_std": 0.4218914955854416,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 1.0,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 444.90625,
"epoch": 0.04583333333333333,
"grad_norm": 3.7362163066864014,
"kl": 0.0159454345703125,
"learning_rate": 9.847222222222223e-07,
"loss": 0.0,
"reward": 1.15625,
"reward_std": 0.3061639815568924,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 1.0,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 410.5,
"epoch": 0.04791666666666667,
"grad_norm": 3.4020462036132812,
"kl": 0.01849365234375,
"learning_rate": 9.840277777777777e-07,
"loss": 0.0,
"reward": 0.6875,
"reward_std": 0.408231720328331,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.59375,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 475.625,
"epoch": 0.05,
"grad_norm": 4.43328332901001,
"kl": 0.0170135498046875,
"learning_rate": 9.833333333333332e-07,
"loss": 0.0,
"reward": 1.125,
"reward_std": 0.3514062538743019,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.96875,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 436.0,
"epoch": 0.052083333333333336,
"grad_norm": 2.972919225692749,
"kl": 0.018707275390625,
"learning_rate": 9.826388888888888e-07,
"loss": 0.0,
"reward": 1.3125,
"reward_std": 0.4671337679028511,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.96875,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 410.0625,
"epoch": 0.05416666666666667,
"grad_norm": 6.743931293487549,
"kl": 0.0195770263671875,
"learning_rate": 9.819444444444443e-07,
"loss": 0.0,
"reward": 1.15625,
"reward_std": 0.4807935431599617,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.78125,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 445.90625,
"epoch": 0.05625,
"grad_norm": 9.631500244140625,
"kl": 0.018890380859375,
"learning_rate": 9.8125e-07,
"loss": 0.0,
"reward": 0.875,
"reward_std": 0.3514062538743019,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.75,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 411.375,
"epoch": 0.058333333333333334,
"grad_norm": 4.5614447593688965,
"kl": 0.019439697265625,
"learning_rate": 9.805555555555554e-07,
"loss": 0.0,
"reward": 1.28125,
"reward_std": 0.3471629247069359,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.9375,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 336.09375,
"epoch": 0.06041666666666667,
"grad_norm": 16.96288299560547,
"kl": 0.034393310546875,
"learning_rate": 9.79861111111111e-07,
"loss": 0.0,
"reward": 0.3125,
"reward_std": 0.408231720328331,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.28125,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 346.71875,
"epoch": 0.0625,
"grad_norm": 3.2910525798797607,
"kl": 0.025390625,
"learning_rate": 9.791666666666667e-07,
"loss": 0.0,
"reward": 1.375,
"reward_std": 0.5597654432058334,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.9375,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 401.65625,
"epoch": 0.06458333333333334,
"grad_norm": 5.341634750366211,
"kl": 0.02484130859375,
"learning_rate": 9.784722222222221e-07,
"loss": 0.0,
"reward": 0.96875,
"reward_std": 0.4218914955854416,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.78125,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 390.09375,
"epoch": 0.06666666666666667,
"grad_norm": 3.29443621635437,
"kl": 0.0223388671875,
"learning_rate": 9.777777777777778e-07,
"loss": 0.0,
"reward": 0.71875,
"reward_std": 0.2630179077386856,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.5,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 395.46875,
"epoch": 0.06875,
"grad_norm": 5.408890247344971,
"kl": 0.02655029296875,
"learning_rate": 9.770833333333332e-07,
"loss": 0.0,
"reward": 0.625,
"reward_std": 0.3514062538743019,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.53125,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 396.90625,
"epoch": 0.07083333333333333,
"grad_norm": 5.162864685058594,
"kl": 0.024810791015625,
"learning_rate": 9.763888888888889e-07,
"loss": 0.0,
"reward": 0.8125,
"reward_std": 0.5468482673168182,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.59375,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 378.0625,
"epoch": 0.07291666666666667,
"grad_norm": 2.051487684249878,
"kl": 0.0206298828125,
"learning_rate": 9.756944444444443e-07,
"loss": 0.0,
"reward": 0.9375,
"reward_std": 0.2896047830581665,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.71875,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 352.53125,
"epoch": 0.075,
"grad_norm": 2.547386407852173,
"kl": 0.025421142578125,
"learning_rate": 9.75e-07,
"loss": 0.0,
"reward": 0.375,
"reward_std": 0.27439429610967636,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.25,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 343.65625,
"epoch": 0.07708333333333334,
"grad_norm": 2.0971977710723877,
"kl": 0.02703857421875,
"learning_rate": 9.743055555555554e-07,
"loss": 0.0,
"reward": 0.28125,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.25,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 330.09375,
"epoch": 0.07916666666666666,
"grad_norm": 3.524477005004883,
"kl": 0.023406982421875,
"learning_rate": 9.73611111111111e-07,
"loss": 0.0,
"reward": 0.9375,
"reward_std": 0.4355512708425522,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.8125,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 369.3125,
"epoch": 0.08125,
"grad_norm": 1.2645416259765625,
"kl": 0.024261474609375,
"learning_rate": 9.729166666666665e-07,
"loss": 0.0,
"reward": 1.0625,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 1.0,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 347.3125,
"epoch": 0.08333333333333333,
"grad_norm": 5.557051658630371,
"kl": 0.029052734375,
"learning_rate": 9.722222222222222e-07,
"loss": 0.0,
"reward": 1.375,
"reward_std": 0.4765502139925957,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 1.0,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 390.03125,
"epoch": 0.08541666666666667,
"grad_norm": 3.182037830352783,
"kl": 0.023590087890625,
"learning_rate": 9.715277777777776e-07,
"loss": 0.0,
"reward": 0.96875,
"reward_std": 0.4095756262540817,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.71875,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 339.8125,
"epoch": 0.0875,
"grad_norm": 9.412776947021484,
"kl": 0.02490234375,
"learning_rate": 9.708333333333333e-07,
"loss": 0.0,
"reward": 0.96875,
"reward_std": 0.3377464786171913,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.8125,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 364.1875,
"epoch": 0.08958333333333333,
"grad_norm": 19.119165420532227,
"kl": 0.0267333984375,
"learning_rate": 9.70138888888889e-07,
"loss": 0.0,
"reward": 1.46875,
"reward_std": 0.3377464786171913,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 1.0,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 339.375,
"epoch": 0.09166666666666666,
"grad_norm": 5.377166748046875,
"kl": 0.027923583984375,
"learning_rate": 9.694444444444444e-07,
"loss": 0.0,
"reward": 1.09375,
"reward_std": 0.494472935795784,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.78125,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 413.1875,
"epoch": 0.09375,
"grad_norm": 11.015312194824219,
"kl": 0.02325439453125,
"learning_rate": 9.6875e-07,
"loss": 0.0,
"reward": 1.09375,
"reward_std": 0.3377464786171913,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.75,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 352.1875,
"epoch": 0.09583333333333334,
"grad_norm": 9.002971649169922,
"kl": 0.032989501953125,
"learning_rate": 9.680555555555555e-07,
"loss": 0.0,
"reward": 1.21875,
"reward_std": 0.4218914955854416,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 1.0,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 353.6875,
"epoch": 0.09791666666666667,
"grad_norm": 9.63524055480957,
"kl": 0.03021240234375,
"learning_rate": 9.673611111111111e-07,
"loss": 0.0,
"reward": 0.875,
"reward_std": 0.4080249145627022,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.6875,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 330.6875,
"epoch": 0.1,
"grad_norm": 6.078580379486084,
"kl": 0.03594970703125,
"learning_rate": 9.666666666666666e-07,
"loss": 0.0,
"reward": 0.625,
"reward_std": 0.3535533845424652,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.5625,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 331.53125,
"epoch": 0.10208333333333333,
"grad_norm": 2.8953256607055664,
"kl": 0.034576416015625,
"learning_rate": 9.659722222222222e-07,
"loss": 0.0,
"reward": 0.90625,
"reward_std": 0.3061639815568924,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.75,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 345.125,
"epoch": 0.10416666666666667,
"grad_norm": 1.4677155017852783,
"kl": 0.031280517578125,
"learning_rate": 9.652777777777777e-07,
"loss": 0.0,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.5,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 321.03125,
"epoch": 0.10625,
"grad_norm": 4.762105464935303,
"kl": 0.03411865234375,
"learning_rate": 9.645833333333333e-07,
"loss": 0.0,
"reward": 0.71875,
"reward_std": 0.3987956568598747,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.5625,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 357.09375,
"epoch": 0.10833333333333334,
"grad_norm": 5.787018775939941,
"kl": 0.03045654296875,
"learning_rate": 9.638888888888888e-07,
"loss": 0.0,
"reward": 1.59375,
"reward_std": 0.4534739926457405,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 1.0,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 364.125,
"epoch": 0.11041666666666666,
"grad_norm": 7.240172386169434,
"kl": 0.03271484375,
"learning_rate": 9.631944444444444e-07,
"loss": 0.0,
"reward": 0.875,
"reward_std": 0.3745020925998688,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.5625,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 373.3125,
"epoch": 0.1125,
"grad_norm": 5.157228946685791,
"kl": 0.030731201171875,
"learning_rate": 9.624999999999999e-07,
"loss": 0.0,
"reward": 1.34375,
"reward_std": 0.5038893818855286,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 1.0,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 415.8125,
"epoch": 0.11458333333333333,
"grad_norm": 2.8105556964874268,
"kl": 0.028778076171875,
"learning_rate": 9.618055555555555e-07,
"loss": 0.0,
"reward": 1.0625,
"reward_std": 0.3745020925998688,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.8125,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 344.0625,
"epoch": 0.11666666666666667,
"grad_norm": 4.457938194274902,
"kl": 0.0325927734375,
"learning_rate": 9.61111111111111e-07,
"loss": 0.0,
"reward": 1.28125,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 1.0,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 349.90625,
"epoch": 0.11875,
"grad_norm": 29.321313858032227,
"kl": 0.03338623046875,
"learning_rate": 9.604166666666666e-07,
"loss": 0.0,
"reward": 0.78125,
"reward_std": 0.3471629247069359,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.625,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 336.03125,
"epoch": 0.12083333333333333,
"grad_norm": 3.946969747543335,
"kl": 0.0340576171875,
"learning_rate": 9.597222222222223e-07,
"loss": 0.0,
"reward": 1.53125,
"reward_std": 0.494472935795784,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 1.0,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 401.0625,
"epoch": 0.12291666666666666,
"grad_norm": 5.483251094818115,
"kl": 0.03076171875,
"learning_rate": 9.590277777777777e-07,
"loss": 0.0,
"reward": 1.125,
"reward_std": 0.408231720328331,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.8125,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 390.5,
"epoch": 0.125,
"grad_norm": 3.5011932849884033,
"kl": 0.029449462890625,
"learning_rate": 9.583333333333334e-07,
"loss": 0.0,
"reward": 1.09375,
"reward_std": 0.5038893818855286,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.8125,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 414.59375,
"epoch": 0.12708333333333333,
"grad_norm": 9.172750473022461,
"kl": 0.02972412109375,
"learning_rate": 9.576388888888888e-07,
"loss": 0.0,
"reward": 1.59375,
"reward_std": 0.38816186785697937,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 1.0,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 354.21875,
"epoch": 0.12916666666666668,
"grad_norm": 3.7546005249023438,
"kl": 0.03594970703125,
"learning_rate": 9.569444444444445e-07,
"loss": 0.0,
"reward": 1.03125,
"reward_std": 0.35564958304166794,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.75,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 361.03125,
"epoch": 0.13125,
"grad_norm": 3.4299263954162598,
"kl": 0.03411865234375,
"learning_rate": 9.5625e-07,
"loss": 0.0,
"reward": 0.8125,
"reward_std": 0.4671337679028511,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.59375,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 402.6875,
"epoch": 0.13333333333333333,
"grad_norm": 2.6421077251434326,
"kl": 0.034515380859375,
"learning_rate": 9.555555555555556e-07,
"loss": 0.0,
"reward": 0.96875,
"reward_std": 0.3377464786171913,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.78125,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 333.3125,
"epoch": 0.13541666666666666,
"grad_norm": 3.543858051300049,
"kl": 0.0399169921875,
"learning_rate": 9.54861111111111e-07,
"loss": 0.0,
"reward": 1.40625,
"reward_std": 0.4628904387354851,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 1.0,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 345.40625,
"epoch": 0.1375,
"grad_norm": 4.455636501312256,
"kl": 0.04608154296875,
"learning_rate": 9.541666666666667e-07,
"loss": 0.0,
"reward": 1.28125,
"reward_std": 0.5116237476468086,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.9375,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 364.9375,
"epoch": 0.13958333333333334,
"grad_norm": 6.6961259841918945,
"kl": 0.04595947265625,
"learning_rate": 9.534722222222223e-07,
"loss": 0.0,
"reward": 0.8125,
"reward_std": 0.4671337679028511,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.71875,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 376.59375,
"epoch": 0.14166666666666666,
"grad_norm": 1.9689209461212158,
"kl": 0.033203125,
"learning_rate": 9.527777777777777e-07,
"loss": 0.0,
"reward": 0.71875,
"reward_std": 0.2630179077386856,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.5,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 354.6875,
"epoch": 0.14375,
"grad_norm": 4.801360130310059,
"kl": 0.0440673828125,
"learning_rate": 9.520833333333333e-07,
"loss": 0.0,
"reward": 1.0,
"reward_std": 0.3514062538743019,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.78125,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 359.375,
"epoch": 0.14583333333333334,
"grad_norm": 4.846868515014648,
"kl": 0.036529541015625,
"learning_rate": 9.513888888888888e-07,
"loss": 0.0,
"reward": 1.4375,
"reward_std": 0.49022960662841797,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 1.0,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 342.5,
"epoch": 0.14791666666666667,
"grad_norm": 4.30186128616333,
"kl": 0.06610107421875,
"learning_rate": 9.506944444444444e-07,
"loss": 0.0001,
"reward": 1.03125,
"reward_std": 0.3808925524353981,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.78125,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 335.40625,
"epoch": 0.15,
"grad_norm": 3.9767167568206787,
"kl": 0.05157470703125,
"learning_rate": 9.499999999999999e-07,
"loss": 0.0001,
"reward": 0.8125,
"reward_std": 0.4355512708425522,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5625,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 376.65625,
"epoch": 0.15208333333333332,
"grad_norm": 2.6517255306243896,
"kl": 0.03857421875,
"learning_rate": 9.493055555555555e-07,
"loss": 0.0,
"reward": 0.84375,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.75,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 382.0,
"epoch": 0.15416666666666667,
"grad_norm": 6.51157283782959,
"kl": 0.04010009765625,
"learning_rate": 9.48611111111111e-07,
"loss": 0.0,
"reward": 1.53125,
"reward_std": 0.5038893818855286,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 1.0,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 390.6875,
"epoch": 0.15625,
"grad_norm": 4.548913478851318,
"kl": 0.0443115234375,
"learning_rate": 9.479166666666666e-07,
"loss": 0.0,
"reward": 0.53125,
"reward_std": 0.3198433741927147,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.34375,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 373.34375,
"epoch": 0.15833333333333333,
"grad_norm": 3.1202585697174072,
"kl": 0.04510498046875,
"learning_rate": 9.472222222222221e-07,
"loss": 0.0,
"reward": 0.9375,
"reward_std": 0.3104073107242584,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.78125,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 354.25,
"epoch": 0.16041666666666668,
"grad_norm": 2.928286552429199,
"kl": 0.04559326171875,
"learning_rate": 9.465277777777777e-07,
"loss": 0.0,
"reward": 1.15625,
"reward_std": 0.3608423173427582,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.75,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 381.3125,
"epoch": 0.1625,
"grad_norm": 5.202105522155762,
"kl": 0.04644775390625,
"learning_rate": 9.458333333333333e-07,
"loss": 0.0,
"reward": 1.5,
"reward_std": 0.44403792917728424,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 1.0,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 374.21875,
"epoch": 0.16458333333333333,
"grad_norm": 6.891729831695557,
"kl": 0.04815673828125,
"learning_rate": 9.451388888888889e-07,
"loss": 0.0,
"reward": 0.40625,
"reward_std": 0.3808925524353981,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.375,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 362.5625,
"epoch": 0.16666666666666666,
"grad_norm": 5.241474151611328,
"kl": 0.04620361328125,
"learning_rate": 9.444444444444444e-07,
"loss": 0.0,
"reward": 1.0625,
"reward_std": 0.4671337679028511,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.78125,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 376.96875,
"epoch": 0.16875,
"grad_norm": 4.389130592346191,
"kl": 0.048095703125,
"learning_rate": 9.4375e-07,
"loss": 0.0,
"reward": 1.34375,
"reward_std": 0.4628904387354851,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 1.0,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 398.84375,
"epoch": 0.17083333333333334,
"grad_norm": 8.09018611907959,
"kl": 0.043701171875,
"learning_rate": 9.430555555555555e-07,
"loss": 0.0,
"reward": 0.90625,
"reward_std": 0.3198433741927147,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.78125,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 386.84375,
"epoch": 0.17291666666666666,
"grad_norm": 2.641420602798462,
"kl": 0.04937744140625,
"learning_rate": 9.423611111111111e-07,
"loss": 0.0,
"reward": 1.40625,
"reward_std": 0.3787454217672348,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 1.0,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 355.78125,
"epoch": 0.175,
"grad_norm": 7.009854793548584,
"kl": 0.04791259765625,
"learning_rate": 9.416666666666666e-07,
"loss": 0.0,
"reward": 1.0625,
"reward_std": 0.4671337679028511,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.78125,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 394.3125,
"epoch": 0.17708333333333334,
"grad_norm": 2.642425537109375,
"kl": 0.0491943359375,
"learning_rate": 9.409722222222222e-07,
"loss": 0.0,
"reward": 1.15625,
"reward_std": 0.3198433741927147,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.75,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 424.03125,
"epoch": 0.17916666666666667,
"grad_norm": 2.3896079063415527,
"kl": 0.04443359375,
"learning_rate": 9.402777777777777e-07,
"loss": 0.0,
"reward": 0.9375,
"reward_std": 0.3335031494498253,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.71875,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 404.34375,
"epoch": 0.18125,
"grad_norm": 4.422949314117432,
"kl": 0.04510498046875,
"learning_rate": 9.395833333333333e-07,
"loss": 0.0,
"reward": 1.0,
"reward_std": 0.3745020925998688,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.75,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 438.65625,
"epoch": 0.18333333333333332,
"grad_norm": 3.236891746520996,
"kl": 0.04571533203125,
"learning_rate": 9.388888888888888e-07,
"loss": 0.0,
"reward": 0.9375,
"reward_std": 0.3104073107242584,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.75,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 434.9375,
"epoch": 0.18541666666666667,
"grad_norm": 8.649247169494629,
"kl": 0.04290771484375,
"learning_rate": 9.381944444444444e-07,
"loss": 0.0,
"reward": 0.78125,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.5,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 429.125,
"epoch": 0.1875,
"grad_norm": 3.4409844875335693,
"kl": 0.04608154296875,
"learning_rate": 9.374999999999999e-07,
"loss": 0.0,
"reward": 1.125,
"reward_std": 0.4492306634783745,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.78125,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 446.4375,
"epoch": 0.18958333333333333,
"grad_norm": 7.360363483428955,
"kl": 0.05267333984375,
"learning_rate": 9.368055555555555e-07,
"loss": 0.0001,
"reward": 1.125,
"reward_std": 0.3335031494498253,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.78125,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 470.03125,
"epoch": 0.19166666666666668,
"grad_norm": 1.7753536701202393,
"kl": 0.046875,
"learning_rate": 9.361111111111111e-07,
"loss": 0.0,
"reward": 1.15625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.75,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 442.65625,
"epoch": 0.19375,
"grad_norm": 16.041807174682617,
"kl": 0.05078125,
"learning_rate": 9.354166666666667e-07,
"loss": 0.0001,
"reward": 0.96875,
"reward_std": 0.494472935795784,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.78125,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 425.90625,
"epoch": 0.19583333333333333,
"grad_norm": 5.043431758880615,
"kl": 0.05303955078125,
"learning_rate": 9.347222222222222e-07,
"loss": 0.0001,
"reward": 1.28125,
"reward_std": 0.35564958304166794,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 1.0,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 457.1875,
"epoch": 0.19791666666666666,
"grad_norm": 3.5956928730010986,
"kl": 0.0496826171875,
"learning_rate": 9.340277777777778e-07,
"loss": 0.0,
"reward": 0.96875,
"reward_std": 0.2630179077386856,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.75,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 456.5,
"epoch": 0.2,
"grad_norm": 2.4647815227508545,
"kl": 0.0474853515625,
"learning_rate": 9.333333333333333e-07,
"loss": 0.0,
"reward": 1.03125,
"reward_std": 0.35564958304166794,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.75,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 423.09375,
"epoch": 0.20208333333333334,
"grad_norm": 4.0429534912109375,
"kl": 0.05426025390625,
"learning_rate": 9.326388888888889e-07,
"loss": 0.0001,
"reward": 1.6875,
"reward_std": 0.49022960662841797,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 1.0,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 446.46875,
"epoch": 0.20416666666666666,
"grad_norm": 4.266984939575195,
"kl": 0.07110595703125,
"learning_rate": 9.319444444444444e-07,
"loss": 0.0001,
"reward": 1.4375,
"reward_std": 0.38298875093460083,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 1.0,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 463.28125,
"epoch": 0.20625,
"grad_norm": 2.255748748779297,
"kl": 0.04803466796875,
"learning_rate": 9.3125e-07,
"loss": 0.0,
"reward": 1.0625,
"reward_std": 0.3514062538743019,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.75,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 431.90625,
"epoch": 0.20833333333333334,
"grad_norm": 2.3120131492614746,
"kl": 0.05810546875,
"learning_rate": 9.305555555555555e-07,
"loss": 0.0001,
"reward": 1.09375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.75,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 416.53125,
"epoch": 0.21041666666666667,
"grad_norm": 3.011608839035034,
"kl": 0.048583984375,
"learning_rate": 9.298611111111111e-07,
"loss": 0.0,
"reward": 1.5625,
"reward_std": 0.49022960662841797,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 1.0,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 425.5,
"epoch": 0.2125,
"grad_norm": 3.771697521209717,
"kl": 0.05657958984375,
"learning_rate": 9.291666666666666e-07,
"loss": 0.0001,
"reward": 1.125,
"reward_std": 0.3650856465101242,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.75,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 397.59375,
"epoch": 0.21458333333333332,
"grad_norm": 3.9330313205718994,
"kl": 0.07916259765625,
"learning_rate": 9.284722222222222e-07,
"loss": 0.0001,
"reward": 1.5,
"reward_std": 0.5081327110528946,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 1.0,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 461.5,
"epoch": 0.21666666666666667,
"grad_norm": 3.8951761722564697,
"kl": 0.04876708984375,
"learning_rate": 9.277777777777777e-07,
"loss": 0.0,
"reward": 1.5625,
"reward_std": 0.49022960662841797,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 1.0,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 402.46875,
"epoch": 0.21875,
"grad_norm": 5.237806797027588,
"kl": 0.05755615234375,
"learning_rate": 9.270833333333333e-07,
"loss": 0.0001,
"reward": 0.8125,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.5,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 407.625,
"epoch": 0.22083333333333333,
"grad_norm": 2.3644521236419678,
"kl": 0.08203125,
"learning_rate": 9.263888888888889e-07,
"loss": 0.0001,
"reward": 0.78125,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.46875,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 390.84375,
"epoch": 0.22291666666666668,
"grad_norm": 2.5936055183410645,
"kl": 0.06103515625,
"learning_rate": 9.256944444444445e-07,
"loss": 0.0001,
"reward": 0.90625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.5,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 432.15625,
"epoch": 0.225,
"grad_norm": 8.296189308166504,
"kl": 0.05120849609375,
"learning_rate": 9.25e-07,
"loss": 0.0001,
"reward": 1.34375,
"reward_std": 0.4628904387354851,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 1.0,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 378.125,
"epoch": 0.22708333333333333,
"grad_norm": 5.851869106292725,
"kl": 0.06024169921875,
"learning_rate": 9.243055555555556e-07,
"loss": 0.0001,
"reward": 1.0625,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.75,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 383.75,
"epoch": 0.22916666666666666,
"grad_norm": 2.6394460201263428,
"kl": 0.05841064453125,
"learning_rate": 9.236111111111111e-07,
"loss": 0.0001,
"reward": 1.125,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.75,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 403.0,
"epoch": 0.23125,
"grad_norm": 2.53338885307312,
"kl": 0.06317138671875,
"learning_rate": 9.229166666666667e-07,
"loss": 0.0001,
"reward": 1.0625,
"reward_std": 0.3514062538743019,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.75,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 397.1875,
"epoch": 0.23333333333333334,
"grad_norm": 1.3787952661514282,
"kl": 0.06109619140625,
"learning_rate": 9.222222222222222e-07,
"loss": 0.0001,
"reward": 0.90625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.75,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 343.53125,
"epoch": 0.23541666666666666,
"grad_norm": 3.1469290256500244,
"kl": 0.06982421875,
"learning_rate": 9.215277777777777e-07,
"loss": 0.0001,
"reward": 1.09375,
"reward_std": 0.3608423173427582,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.75,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 393.375,
"epoch": 0.2375,
"grad_norm": 2.1029345989227295,
"kl": 0.064208984375,
"learning_rate": 9.208333333333332e-07,
"loss": 0.0001,
"reward": 0.65625,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.5,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 365.65625,
"epoch": 0.23958333333333334,
"grad_norm": 7.250767230987549,
"kl": 0.06658935546875,
"learning_rate": 9.201388888888888e-07,
"loss": 0.0001,
"reward": 0.96875,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.75,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 358.8125,
"epoch": 0.24166666666666667,
"grad_norm": 2.3046934604644775,
"kl": 0.064208984375,
"learning_rate": 9.194444444444443e-07,
"loss": 0.0001,
"reward": 1.0,
"reward_std": 0.26726123690605164,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.75,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 347.0,
"epoch": 0.24375,
"grad_norm": 1.9355486631393433,
"kl": 0.0821533203125,
"learning_rate": 9.187499999999999e-07,
"loss": 0.0001,
"reward": 0.71875,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.5,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 332.25,
"epoch": 0.24583333333333332,
"grad_norm": 2.3174281120300293,
"kl": 0.0784912109375,
"learning_rate": 9.180555555555554e-07,
"loss": 0.0001,
"reward": 0.65625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.5,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 358.75,
"epoch": 0.24791666666666667,
"grad_norm": 3.247333288192749,
"kl": 0.0699462890625,
"learning_rate": 9.17361111111111e-07,
"loss": 0.0001,
"reward": 1.09375,
"reward_std": 0.3061639815568924,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.75,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 350.6875,
"epoch": 0.25,
"grad_norm": 2.650913953781128,
"kl": 0.07958984375,
"learning_rate": 9.166666666666665e-07,
"loss": 0.0001,
"reward": 1.65625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 1.0,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 324.34375,
"epoch": 0.2520833333333333,
"grad_norm": 4.5364603996276855,
"kl": 0.0885009765625,
"learning_rate": 9.159722222222222e-07,
"loss": 0.0001,
"reward": 1.34375,
"reward_std": 0.3061639815568924,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 315.4375,
"epoch": 0.25416666666666665,
"grad_norm": 2.341081142425537,
"kl": 0.134765625,
"learning_rate": 9.152777777777777e-07,
"loss": 0.0001,
"reward": 0.59375,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.5,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 310.4375,
"epoch": 0.25625,
"grad_norm": 1.698325276374817,
"kl": 0.1190185546875,
"learning_rate": 9.145833333333333e-07,
"loss": 0.0001,
"reward": 0.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.25,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 327.5,
"epoch": 0.25833333333333336,
"grad_norm": 2.0879573822021484,
"kl": 0.1610107421875,
"learning_rate": 9.138888888888888e-07,
"loss": 0.0002,
"reward": 1.09375,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.75,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 326.40625,
"epoch": 0.2604166666666667,
"grad_norm": 8.24306869506836,
"kl": 0.0919189453125,
"learning_rate": 9.131944444444444e-07,
"loss": 0.0001,
"reward": 1.75,
"reward_std": 0.3335031494498253,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 258.65625,
"epoch": 0.2625,
"grad_norm": 4.286442279815674,
"kl": 0.1165771484375,
"learning_rate": 9.124999999999999e-07,
"loss": 0.0001,
"reward": 1.1875,
"reward_std": 0.38298875093460083,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 279.8125,
"epoch": 0.26458333333333334,
"grad_norm": 36.095619201660156,
"kl": 0.1119384765625,
"learning_rate": 9.118055555555555e-07,
"loss": 0.0001,
"reward": 1.5625,
"reward_std": 0.48503687232732773,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 1.0,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 290.15625,
"epoch": 0.26666666666666666,
"grad_norm": 2.5451955795288086,
"kl": 0.1114501953125,
"learning_rate": 9.11111111111111e-07,
"loss": 0.0001,
"reward": 1.25,
"reward_std": 0.3335031494498253,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 266.4375,
"epoch": 0.26875,
"grad_norm": 2.765319347381592,
"kl": 0.1148681640625,
"learning_rate": 9.104166666666666e-07,
"loss": 0.0001,
"reward": 0.84375,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.5,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 280.3125,
"epoch": 0.2708333333333333,
"grad_norm": 4.0128583908081055,
"kl": 0.1129150390625,
"learning_rate": 9.097222222222221e-07,
"loss": 0.0001,
"reward": 0.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.25,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 244.5625,
"epoch": 0.27291666666666664,
"grad_norm": 4.9140801429748535,
"kl": 0.133544921875,
"learning_rate": 9.090277777777777e-07,
"loss": 0.0001,
"reward": 1.6875,
"reward_std": 0.4671337679028511,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 1.0,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 245.96875,
"epoch": 0.275,
"grad_norm": 3.4184932708740234,
"kl": 0.1329345703125,
"learning_rate": 9.083333333333332e-07,
"loss": 0.0001,
"reward": 0.65625,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.5,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 264.90625,
"epoch": 0.27708333333333335,
"grad_norm": 4.854220390319824,
"kl": 0.1396484375,
"learning_rate": 9.076388888888888e-07,
"loss": 0.0001,
"reward": 1.46875,
"reward_std": 0.3808925524353981,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 1.0,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 262.5,
"epoch": 0.2791666666666667,
"grad_norm": 3.3047986030578613,
"kl": 0.146240234375,
"learning_rate": 9.069444444444443e-07,
"loss": 0.0001,
"reward": 1.25,
"reward_std": 0.3650856465101242,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 264.84375,
"epoch": 0.28125,
"grad_norm": 6.767662048339844,
"kl": 0.1334228515625,
"learning_rate": 9.0625e-07,
"loss": 0.0001,
"reward": 1.0625,
"reward_std": 0.3335031494498253,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.75,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 246.625,
"epoch": 0.2833333333333333,
"grad_norm": 2.0335617065429688,
"kl": 0.142578125,
"learning_rate": 9.055555555555556e-07,
"loss": 0.0001,
"reward": 0.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.25,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 278.875,
"epoch": 0.28541666666666665,
"grad_norm": 12.565716743469238,
"kl": 0.1414794921875,
"learning_rate": 9.048611111111111e-07,
"loss": 0.0001,
"reward": 1.375,
"reward_std": 0.4492306634783745,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 1.0,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 271.65625,
"epoch": 0.2875,
"grad_norm": 9.577066421508789,
"kl": 0.1416015625,
"learning_rate": 9.041666666666667e-07,
"loss": 0.0001,
"reward": 1.34375,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 269.375,
"epoch": 0.28958333333333336,
"grad_norm": 4.135301113128662,
"kl": 0.1522216796875,
"learning_rate": 9.034722222222222e-07,
"loss": 0.0002,
"reward": 1.125,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.75,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 278.59375,
"epoch": 0.2916666666666667,
"grad_norm": 2.6078929901123047,
"kl": 0.1363525390625,
"learning_rate": 9.027777777777778e-07,
"loss": 0.0001,
"reward": 0.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.25,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 264.71875,
"epoch": 0.29375,
"grad_norm": 23.88492774963379,
"kl": 0.1279296875,
"learning_rate": 9.020833333333333e-07,
"loss": 0.0001,
"reward": 1.21875,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 268.03125,
"epoch": 0.29583333333333334,
"grad_norm": 2.0252206325531006,
"kl": 0.139404296875,
"learning_rate": 9.013888888888889e-07,
"loss": 0.0001,
"reward": 1.625,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 1.0,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 293.4375,
"epoch": 0.29791666666666666,
"grad_norm": 3.3563263416290283,
"kl": 0.12841796875,
"learning_rate": 9.006944444444444e-07,
"loss": 0.0001,
"reward": 1.03125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.75,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 273.71875,
"epoch": 0.3,
"grad_norm": 2.928553819656372,
"kl": 0.156982421875,
"learning_rate": 9e-07,
"loss": 0.0002,
"reward": 1.65625,
"reward_std": 0.3377464786171913,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 1.0,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 274.5625,
"epoch": 0.3020833333333333,
"grad_norm": 2.215178966522217,
"kl": 0.145263671875,
"learning_rate": 8.993055555555555e-07,
"loss": 0.0001,
"reward": 0.875,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.5,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 280.3125,
"epoch": 0.30416666666666664,
"grad_norm": 2.916330337524414,
"kl": 0.142822265625,
"learning_rate": 8.986111111111111e-07,
"loss": 0.0001,
"reward": 1.78125,
"reward_std": 0.3061639815568924,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 1.0,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 290.5625,
"epoch": 0.30625,
"grad_norm": 2.6211419105529785,
"kl": 0.1351318359375,
"learning_rate": 8.979166666666666e-07,
"loss": 0.0001,
"reward": 1.21875,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 301.46875,
"epoch": 0.30833333333333335,
"grad_norm": 9.624622344970703,
"kl": 0.154296875,
"learning_rate": 8.972222222222222e-07,
"loss": 0.0002,
"reward": 1.5625,
"reward_std": 0.5468482673168182,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.96875,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 275.5,
"epoch": 0.3104166666666667,
"grad_norm": 1.7619229555130005,
"kl": 0.137939453125,
"learning_rate": 8.965277777777778e-07,
"loss": 0.0001,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.5,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 320.78125,
"epoch": 0.3125,
"grad_norm": 3.684971570968628,
"kl": 0.12841796875,
"learning_rate": 8.958333333333334e-07,
"loss": 0.0001,
"reward": 1.75,
"reward_std": 0.4261348247528076,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 0.96875,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 304.40625,
"epoch": 0.3145833333333333,
"grad_norm": 3.517716407775879,
"kl": 0.132568359375,
"learning_rate": 8.951388888888889e-07,
"loss": 0.0001,
"reward": 0.8125,
"reward_std": 0.2587745785713196,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.5,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 303.125,
"epoch": 0.31666666666666665,
"grad_norm": 2.7190983295440674,
"kl": 0.131103515625,
"learning_rate": 8.944444444444445e-07,
"loss": 0.0001,
"reward": 0.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.5,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 307.875,
"epoch": 0.31875,
"grad_norm": 3.1140782833099365,
"kl": 0.145751953125,
"learning_rate": 8.9375e-07,
"loss": 0.0001,
"reward": 1.3125,
"reward_std": 0.3335031494498253,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.71875,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 305.9375,
"epoch": 0.32083333333333336,
"grad_norm": 5.122025012969971,
"kl": 0.132080078125,
"learning_rate": 8.930555555555556e-07,
"loss": 0.0001,
"reward": 1.28125,
"reward_std": 0.2630179077386856,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.71875,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 307.9375,
"epoch": 0.3229166666666667,
"grad_norm": 2.554211378097534,
"kl": 0.149169921875,
"learning_rate": 8.923611111111111e-07,
"loss": 0.0001,
"reward": 1.125,
"reward_std": 0.27439429610967636,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.71875,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 311.375,
"epoch": 0.325,
"grad_norm": 1.5763827562332153,
"kl": 0.1688232421875,
"learning_rate": 8.916666666666667e-07,
"loss": 0.0002,
"reward": 0.90625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.5,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 309.78125,
"epoch": 0.32708333333333334,
"grad_norm": 3.9709863662719727,
"kl": 0.120361328125,
"learning_rate": 8.909722222222222e-07,
"loss": 0.0001,
"reward": 1.625,
"reward_std": 0.5081327110528946,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.9375,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 316.0,
"epoch": 0.32916666666666666,
"grad_norm": 1.7929834127426147,
"kl": 0.1209716796875,
"learning_rate": 8.902777777777777e-07,
"loss": 0.0001,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 282.28125,
"epoch": 0.33125,
"grad_norm": 5.112552165985107,
"kl": 0.143798828125,
"learning_rate": 8.895833333333332e-07,
"loss": 0.0001,
"reward": 1.40625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.75,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 328.4375,
"epoch": 0.3333333333333333,
"grad_norm": 2.3229613304138184,
"kl": 0.12841796875,
"learning_rate": 8.888888888888888e-07,
"loss": 0.0001,
"reward": 1.25,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 303.21875,
"epoch": 0.33541666666666664,
"grad_norm": 0.010016150772571564,
"kl": 0.124267578125,
"learning_rate": 8.881944444444443e-07,
"loss": 0.0001,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 312.65625,
"epoch": 0.3375,
"grad_norm": 2.9473917484283447,
"kl": 0.1375732421875,
"learning_rate": 8.874999999999999e-07,
"loss": 0.0001,
"reward": 1.375,
"reward_std": 0.2925042062997818,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.75,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 308.3125,
"epoch": 0.33958333333333335,
"grad_norm": 19.425966262817383,
"kl": 0.19970703125,
"learning_rate": 8.868055555555555e-07,
"loss": 0.0002,
"reward": 0.9375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.5,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 316.28125,
"epoch": 0.3416666666666667,
"grad_norm": 1.3836629390716553,
"kl": 0.135986328125,
"learning_rate": 8.861111111111111e-07,
"loss": 0.0001,
"reward": 1.15625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.75,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 334.1875,
"epoch": 0.34375,
"grad_norm": 6.287445068359375,
"kl": 0.136474609375,
"learning_rate": 8.854166666666666e-07,
"loss": 0.0001,
"reward": 1.78125,
"reward_std": 0.3377464786171913,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 1.0,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 319.15625,
"epoch": 0.3458333333333333,
"grad_norm": 4.487782001495361,
"kl": 0.1461181640625,
"learning_rate": 8.847222222222222e-07,
"loss": 0.0001,
"reward": 1.625,
"reward_std": 0.4671337679028511,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.96875,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 300.625,
"epoch": 0.34791666666666665,
"grad_norm": 2.907824754714966,
"kl": 0.141845703125,
"learning_rate": 8.840277777777777e-07,
"loss": 0.0001,
"reward": 1.0625,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.75,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 291.34375,
"epoch": 0.35,
"grad_norm": 2.528587818145752,
"kl": 0.146728515625,
"learning_rate": 8.833333333333333e-07,
"loss": 0.0001,
"reward": 1.34375,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 290.1875,
"epoch": 0.35208333333333336,
"grad_norm": 0.007634375710040331,
"kl": 0.14111328125,
"learning_rate": 8.826388888888888e-07,
"loss": 0.0001,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 301.75,
"epoch": 0.3541666666666667,
"grad_norm": 0.009850732050836086,
"kl": 0.145263671875,
"learning_rate": 8.819444444444444e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.25,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 286.375,
"epoch": 0.35625,
"grad_norm": 4.6734089851379395,
"kl": 0.1787109375,
"learning_rate": 8.812499999999999e-07,
"loss": 0.0002,
"reward": 1.125,
"reward_std": 0.40089185535907745,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.75,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 317.0,
"epoch": 0.35833333333333334,
"grad_norm": 1.5366886854171753,
"kl": 0.13427734375,
"learning_rate": 8.805555555555555e-07,
"loss": 0.0001,
"reward": 0.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.5,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 286.5625,
"epoch": 0.36041666666666666,
"grad_norm": 0.022151868790388107,
"kl": 0.1435546875,
"learning_rate": 8.79861111111111e-07,
"loss": 0.0001,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 274.3125,
"epoch": 0.3625,
"grad_norm": 2.0597925186157227,
"kl": 0.1456298828125,
"learning_rate": 8.791666666666666e-07,
"loss": 0.0001,
"reward": 1.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 276.03125,
"epoch": 0.3645833333333333,
"grad_norm": 0.007576541043817997,
"kl": 0.160888671875,
"learning_rate": 8.784722222222221e-07,
"loss": 0.0002,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.5,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 267.4375,
"epoch": 0.36666666666666664,
"grad_norm": 4.791009902954102,
"kl": 0.166748046875,
"learning_rate": 8.777777777777777e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.3514062538743019,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 1.0,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 278.3125,
"epoch": 0.36875,
"grad_norm": 1.4421019554138184,
"kl": 0.330810546875,
"learning_rate": 8.770833333333333e-07,
"loss": 0.0003,
"reward": 1.34375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 274.6875,
"epoch": 0.37083333333333335,
"grad_norm": 13.741174697875977,
"kl": 0.174560546875,
"learning_rate": 8.763888888888889e-07,
"loss": 0.0002,
"reward": 1.34375,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 293.71875,
"epoch": 0.3729166666666667,
"grad_norm": 2.762673854827881,
"kl": 0.158447265625,
"learning_rate": 8.756944444444444e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.3650856465101242,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 265.71875,
"epoch": 0.375,
"grad_norm": 3.762700319290161,
"kl": 0.16552734375,
"learning_rate": 8.75e-07,
"loss": 0.0002,
"reward": 1.6875,
"reward_std": 0.3514062538743019,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 1.0,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 312.59375,
"epoch": 0.3770833333333333,
"grad_norm": 2.4614808559417725,
"kl": 0.14794921875,
"learning_rate": 8.743055555555555e-07,
"loss": 0.0001,
"reward": 0.375,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.25,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 267.5,
"epoch": 0.37916666666666665,
"grad_norm": 2.320138692855835,
"kl": 0.169921875,
"learning_rate": 8.736111111111111e-07,
"loss": 0.0002,
"reward": 1.8125,
"reward_std": 0.249358132481575,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 1.0,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 267.46875,
"epoch": 0.38125,
"grad_norm": 1.7790272235870361,
"kl": 0.1593017578125,
"learning_rate": 8.729166666666666e-07,
"loss": 0.0002,
"reward": 1.6875,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 1.0,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 280.5625,
"epoch": 0.38333333333333336,
"grad_norm": 2.4105911254882812,
"kl": 0.16015625,
"learning_rate": 8.722222222222222e-07,
"loss": 0.0002,
"reward": 0.84375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.5,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 265.84375,
"epoch": 0.3854166666666667,
"grad_norm": 2.337040424346924,
"kl": 0.190185546875,
"learning_rate": 8.715277777777777e-07,
"loss": 0.0002,
"reward": 1.375,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.75,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 276.59375,
"epoch": 0.3875,
"grad_norm": 5.802671909332275,
"kl": 0.20263671875,
"learning_rate": 8.708333333333333e-07,
"loss": 0.0002,
"reward": 1.65625,
"reward_std": 0.4218914955854416,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 1.0,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 256.8125,
"epoch": 0.38958333333333334,
"grad_norm": 2.9159724712371826,
"kl": 0.18212890625,
"learning_rate": 8.701388888888888e-07,
"loss": 0.0002,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.5,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 265.125,
"epoch": 0.39166666666666666,
"grad_norm": 21.03277015686035,
"kl": 0.178466796875,
"learning_rate": 8.694444444444444e-07,
"loss": 0.0002,
"reward": 0.90625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.5,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 276.0625,
"epoch": 0.39375,
"grad_norm": 1.9231356382369995,
"kl": 0.187255859375,
"learning_rate": 8.687499999999999e-07,
"loss": 0.0002,
"reward": 1.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 274.90625,
"epoch": 0.3958333333333333,
"grad_norm": 1.5890367031097412,
"kl": 0.1796875,
"learning_rate": 8.680555555555555e-07,
"loss": 0.0002,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 282.5,
"epoch": 0.39791666666666664,
"grad_norm": 1.5505869388580322,
"kl": 0.163330078125,
"learning_rate": 8.673611111111111e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 264.53125,
"epoch": 0.4,
"grad_norm": 0.04606321454048157,
"kl": 0.180419921875,
"learning_rate": 8.666666666666667e-07,
"loss": 0.0002,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.25,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 263.28125,
"epoch": 0.40208333333333335,
"grad_norm": 4.318572044372559,
"kl": 0.201171875,
"learning_rate": 8.659722222222222e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.35564958304166794,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 1.0,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 263.34375,
"epoch": 0.4041666666666667,
"grad_norm": 2.215465784072876,
"kl": 0.185546875,
"learning_rate": 8.652777777777778e-07,
"loss": 0.0002,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.5,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 256.5625,
"epoch": 0.40625,
"grad_norm": 1.9885610342025757,
"kl": 0.22119140625,
"learning_rate": 8.645833333333333e-07,
"loss": 0.0002,
"reward": 1.8125,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 1.0,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 286.5,
"epoch": 0.4083333333333333,
"grad_norm": 2.8268754482269287,
"kl": 0.1611328125,
"learning_rate": 8.638888888888889e-07,
"loss": 0.0002,
"reward": 1.84375,
"reward_std": 0.3061639815568924,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 1.0,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 285.09375,
"epoch": 0.41041666666666665,
"grad_norm": 3.276139497756958,
"kl": 0.173828125,
"learning_rate": 8.631944444444445e-07,
"loss": 0.0002,
"reward": 1.9375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 281.65625,
"epoch": 0.4125,
"grad_norm": 3.957500696182251,
"kl": 0.1904296875,
"learning_rate": 8.625e-07,
"loss": 0.0002,
"reward": 1.34375,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 282.21875,
"epoch": 0.41458333333333336,
"grad_norm": 1.9258300065994263,
"kl": 0.17919921875,
"learning_rate": 8.618055555555556e-07,
"loss": 0.0002,
"reward": 1.09375,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.75,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 303.71875,
"epoch": 0.4166666666666667,
"grad_norm": 1.656267523765564,
"kl": 0.1787109375,
"learning_rate": 8.611111111111111e-07,
"loss": 0.0002,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.5,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 281.96875,
"epoch": 0.41875,
"grad_norm": 3.227402687072754,
"kl": 0.18408203125,
"learning_rate": 8.604166666666667e-07,
"loss": 0.0002,
"reward": 1.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 1.0,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 294.28125,
"epoch": 0.42083333333333334,
"grad_norm": 1.7516613006591797,
"kl": 0.1708984375,
"learning_rate": 8.597222222222222e-07,
"loss": 0.0002,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 319.21875,
"epoch": 0.42291666666666666,
"grad_norm": 2.2734436988830566,
"kl": 0.17041015625,
"learning_rate": 8.590277777777776e-07,
"loss": 0.0002,
"reward": 1.875,
"reward_std": 0.2314550280570984,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 1.0,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 298.25,
"epoch": 0.425,
"grad_norm": 2.0509085655212402,
"kl": 0.18212890625,
"learning_rate": 8.583333333333332e-07,
"loss": 0.0002,
"reward": 0.78125,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.5,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 300.25,
"epoch": 0.4270833333333333,
"grad_norm": 1.707233190536499,
"kl": 0.186279296875,
"learning_rate": 8.576388888888887e-07,
"loss": 0.0002,
"reward": 0.40625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.25,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 303.3125,
"epoch": 0.42916666666666664,
"grad_norm": 0.017311234027147293,
"kl": 0.20947265625,
"learning_rate": 8.569444444444444e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 307.0625,
"epoch": 0.43125,
"grad_norm": 5.802515506744385,
"kl": 0.167236328125,
"learning_rate": 8.5625e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.4218914955854416,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 1.0,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 290.84375,
"epoch": 0.43333333333333335,
"grad_norm": 2.5334019660949707,
"kl": 0.179931640625,
"learning_rate": 8.555555555555555e-07,
"loss": 0.0002,
"reward": 0.71875,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.5,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 305.15625,
"epoch": 0.4354166666666667,
"grad_norm": 2.551426649093628,
"kl": 0.1845703125,
"learning_rate": 8.548611111111111e-07,
"loss": 0.0002,
"reward": 1.53125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 1.0,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 325.5,
"epoch": 0.4375,
"grad_norm": 6.069310665130615,
"kl": 0.181640625,
"learning_rate": 8.541666666666666e-07,
"loss": 0.0002,
"reward": 1.6875,
"reward_std": 0.3745020925998688,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 1.0,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 357.09375,
"epoch": 0.4395833333333333,
"grad_norm": 3.099595308303833,
"kl": 0.1796875,
"learning_rate": 8.534722222222222e-07,
"loss": 0.0002,
"reward": 1.15625,
"reward_std": 0.3061639815568924,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.71875,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 323.59375,
"epoch": 0.44166666666666665,
"grad_norm": 2.682811737060547,
"kl": 0.1591796875,
"learning_rate": 8.527777777777777e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.2314550280570984,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 359.59375,
"epoch": 0.44375,
"grad_norm": 2.0939760208129883,
"kl": 0.18212890625,
"learning_rate": 8.520833333333333e-07,
"loss": 0.0002,
"reward": 1.3125,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.75,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 354.375,
"epoch": 0.44583333333333336,
"grad_norm": 2.2327020168304443,
"kl": 0.18212890625,
"learning_rate": 8.513888888888888e-07,
"loss": 0.0002,
"reward": 1.59375,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 1.0,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 383.71875,
"epoch": 0.4479166666666667,
"grad_norm": 2.284752607345581,
"kl": 0.157958984375,
"learning_rate": 8.506944444444444e-07,
"loss": 0.0002,
"reward": 0.84375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.5,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 393.8125,
"epoch": 0.45,
"grad_norm": 4.905982971191406,
"kl": 0.15869140625,
"learning_rate": 8.499999999999999e-07,
"loss": 0.0002,
"reward": 1.53125,
"reward_std": 0.3787454217672348,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 1.0,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 380.21875,
"epoch": 0.45208333333333334,
"grad_norm": 2.2356278896331787,
"kl": 0.15185546875,
"learning_rate": 8.493055555555555e-07,
"loss": 0.0002,
"reward": 1.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 1.0,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 410.625,
"epoch": 0.45416666666666666,
"grad_norm": 0.00922483392059803,
"kl": 0.154541015625,
"learning_rate": 8.48611111111111e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 433.46875,
"epoch": 0.45625,
"grad_norm": 7.279197692871094,
"kl": 0.149658203125,
"learning_rate": 8.479166666666667e-07,
"loss": 0.0001,
"reward": 1.1875,
"reward_std": 0.2587745785713196,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 363.03125,
"epoch": 0.4583333333333333,
"grad_norm": 15.603084564208984,
"kl": 0.15869140625,
"learning_rate": 8.472222222222222e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 404.78125,
"epoch": 0.46041666666666664,
"grad_norm": 1.14226233959198,
"kl": 0.1630859375,
"learning_rate": 8.465277777777778e-07,
"loss": 0.0002,
"reward": 1.34375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 394.78125,
"epoch": 0.4625,
"grad_norm": 3.340059280395508,
"kl": 0.1669921875,
"learning_rate": 8.458333333333333e-07,
"loss": 0.0002,
"reward": 1.09375,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.75,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 369.375,
"epoch": 0.46458333333333335,
"grad_norm": 1.2038440704345703,
"kl": 0.175537109375,
"learning_rate": 8.451388888888889e-07,
"loss": 0.0002,
"reward": 0.90625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.5,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 413.53125,
"epoch": 0.4666666666666667,
"grad_norm": 3.947021722793579,
"kl": 0.165283203125,
"learning_rate": 8.444444444444444e-07,
"loss": 0.0002,
"reward": 1.1875,
"reward_std": 0.2925042062997818,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 392.5,
"epoch": 0.46875,
"grad_norm": 4.695667266845703,
"kl": 0.17724609375,
"learning_rate": 8.4375e-07,
"loss": 0.0002,
"reward": 1.90625,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 0.96875,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 438.9375,
"epoch": 0.4708333333333333,
"grad_norm": 3.941527843475342,
"kl": 0.160888671875,
"learning_rate": 8.430555555555555e-07,
"loss": 0.0002,
"reward": 1.1875,
"reward_std": 0.3745020925998688,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 404.96875,
"epoch": 0.47291666666666665,
"grad_norm": 1.2121331691741943,
"kl": 0.177734375,
"learning_rate": 8.423611111111111e-07,
"loss": 0.0002,
"reward": 1.40625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.75,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 438.75,
"epoch": 0.475,
"grad_norm": 1.2622644901275635,
"kl": 0.17529296875,
"learning_rate": 8.416666666666666e-07,
"loss": 0.0002,
"reward": 0.8125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.5,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 429.15625,
"epoch": 0.47708333333333336,
"grad_norm": 2.040025472640991,
"kl": 0.167236328125,
"learning_rate": 8.409722222222222e-07,
"loss": 0.0002,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.5,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 448.0625,
"epoch": 0.4791666666666667,
"grad_norm": 2.3206212520599365,
"kl": 0.16357421875,
"learning_rate": 8.402777777777777e-07,
"loss": 0.0002,
"reward": 1.71875,
"reward_std": 0.3787454217672348,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 467.65625,
"epoch": 0.48125,
"grad_norm": 2.9016504287719727,
"kl": 0.17041015625,
"learning_rate": 8.395833333333333e-07,
"loss": 0.0002,
"reward": 0.71875,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.5,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 477.4375,
"epoch": 0.48333333333333334,
"grad_norm": 0.07506023347377777,
"kl": 0.177001953125,
"learning_rate": 8.388888888888888e-07,
"loss": 0.0002,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 480.90625,
"epoch": 0.48541666666666666,
"grad_norm": 1.9703696966171265,
"kl": 0.1591796875,
"learning_rate": 8.381944444444445e-07,
"loss": 0.0002,
"reward": 1.4375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 1.0,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 456.03125,
"epoch": 0.4875,
"grad_norm": 1.3863449096679688,
"kl": 0.162109375,
"learning_rate": 8.375e-07,
"loss": 0.0002,
"reward": 1.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.75,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 461.75,
"epoch": 0.4895833333333333,
"grad_norm": 3.7418622970581055,
"kl": 0.15869140625,
"learning_rate": 8.368055555555556e-07,
"loss": 0.0002,
"reward": 1.8125,
"reward_std": 0.408231720328331,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 1.0,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 478.15625,
"epoch": 0.49166666666666664,
"grad_norm": 2.131373882293701,
"kl": 0.1455078125,
"learning_rate": 8.361111111111111e-07,
"loss": 0.0001,
"reward": 1.75,
"reward_std": 0.2587745785713196,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 455.0,
"epoch": 0.49375,
"grad_norm": 1.503204584121704,
"kl": 0.147705078125,
"learning_rate": 8.354166666666667e-07,
"loss": 0.0001,
"reward": 1.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 473.5625,
"epoch": 0.49583333333333335,
"grad_norm": 1.8451569080352783,
"kl": 0.14990234375,
"learning_rate": 8.347222222222222e-07,
"loss": 0.0001,
"reward": 1.25,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 514.875,
"epoch": 0.4979166666666667,
"grad_norm": 0.005814376752823591,
"kl": 0.1365966796875,
"learning_rate": 8.340277777777778e-07,
"loss": 0.0001,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 505.5625,
"epoch": 0.5,
"grad_norm": 2.293483257293701,
"kl": 0.149169921875,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0001,
"reward": 1.875,
"reward_std": 0.2925042062997818,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 1.0,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 552.78125,
"epoch": 0.5020833333333333,
"grad_norm": 1.321162462234497,
"kl": 0.13720703125,
"learning_rate": 8.326388888888889e-07,
"loss": 0.0001,
"reward": 1.9375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 505.28125,
"epoch": 0.5041666666666667,
"grad_norm": 2.4420886039733887,
"kl": 0.17236328125,
"learning_rate": 8.319444444444444e-07,
"loss": 0.0002,
"reward": 1.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.75,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 532.8125,
"epoch": 0.50625,
"grad_norm": 2.3468263149261475,
"kl": 0.162109375,
"learning_rate": 8.3125e-07,
"loss": 0.0002,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 508.0,
"epoch": 0.5083333333333333,
"grad_norm": 2.658416509628296,
"kl": 0.148681640625,
"learning_rate": 8.305555555555555e-07,
"loss": 0.0001,
"reward": 1.375,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.75,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 548.59375,
"epoch": 0.5104166666666666,
"grad_norm": 2.731208324432373,
"kl": 0.137451171875,
"learning_rate": 8.298611111111111e-07,
"loss": 0.0001,
"reward": 1.4375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 537.75,
"epoch": 0.5125,
"grad_norm": 1.5126349925994873,
"kl": 0.13818359375,
"learning_rate": 8.291666666666666e-07,
"loss": 0.0001,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 570.125,
"epoch": 0.5145833333333333,
"grad_norm": 0.9014095664024353,
"kl": 0.147216796875,
"learning_rate": 8.284722222222223e-07,
"loss": 0.0001,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 569.03125,
"epoch": 0.5166666666666667,
"grad_norm": 4.276453018188477,
"kl": 0.1531982421875,
"learning_rate": 8.277777777777777e-07,
"loss": 0.0002,
"reward": 1.65625,
"reward_std": 0.3787454217672348,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 1.0,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 532.28125,
"epoch": 0.51875,
"grad_norm": 1.9292553663253784,
"kl": 0.1513671875,
"learning_rate": 8.270833333333333e-07,
"loss": 0.0002,
"reward": 1.0625,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.75,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 567.3125,
"epoch": 0.5208333333333334,
"grad_norm": 0.9851927161216736,
"kl": 0.151123046875,
"learning_rate": 8.263888888888888e-07,
"loss": 0.0002,
"reward": 1.1875,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 552.25,
"epoch": 0.5229166666666667,
"grad_norm": 0.006976987235248089,
"kl": 0.158447265625,
"learning_rate": 8.256944444444444e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 511.84375,
"epoch": 0.525,
"grad_norm": 1.9051716327667236,
"kl": 0.150146484375,
"learning_rate": 8.249999999999999e-07,
"loss": 0.0002,
"reward": 1.84375,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 1.0,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 629.25,
"epoch": 0.5270833333333333,
"grad_norm": 1.9278944730758667,
"kl": 0.143798828125,
"learning_rate": 8.243055555555555e-07,
"loss": 0.0001,
"reward": 0.78125,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.5,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 527.25,
"epoch": 0.5291666666666667,
"grad_norm": 1.7085317373275757,
"kl": 0.160888671875,
"learning_rate": 8.23611111111111e-07,
"loss": 0.0002,
"reward": 1.40625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.75,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 549.59375,
"epoch": 0.53125,
"grad_norm": 0.015542632900178432,
"kl": 0.159423828125,
"learning_rate": 8.229166666666666e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 546.15625,
"epoch": 0.5333333333333333,
"grad_norm": 2.3451013565063477,
"kl": 0.15869140625,
"learning_rate": 8.222222222222221e-07,
"loss": 0.0002,
"reward": 1.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 531.0,
"epoch": 0.5354166666666667,
"grad_norm": 2.0487141609191895,
"kl": 0.140869140625,
"learning_rate": 8.215277777777777e-07,
"loss": 0.0001,
"reward": 1.40625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.75,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 562.46875,
"epoch": 0.5375,
"grad_norm": 1.092297911643982,
"kl": 0.16064453125,
"learning_rate": 8.208333333333332e-07,
"loss": 0.0002,
"reward": 0.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.25,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 537.3125,
"epoch": 0.5395833333333333,
"grad_norm": 1.0972661972045898,
"kl": 0.15966796875,
"learning_rate": 8.201388888888888e-07,
"loss": 0.0002,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.5,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 578.96875,
"epoch": 0.5416666666666666,
"grad_norm": 0.0056478967890143394,
"kl": 0.149658203125,
"learning_rate": 8.194444444444443e-07,
"loss": 0.0001,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 592.65625,
"epoch": 0.54375,
"grad_norm": 1.2728099822998047,
"kl": 0.14697265625,
"learning_rate": 8.187499999999999e-07,
"loss": 0.0001,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.5,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 536.5625,
"epoch": 0.5458333333333333,
"grad_norm": 1.4108928442001343,
"kl": 0.162353515625,
"learning_rate": 8.180555555555555e-07,
"loss": 0.0002,
"reward": 1.375,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.75,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 580.46875,
"epoch": 0.5479166666666667,
"grad_norm": 0.014726191759109497,
"kl": 0.161865234375,
"learning_rate": 8.173611111111111e-07,
"loss": 0.0002,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 562.375,
"epoch": 0.55,
"grad_norm": 1.6673963069915771,
"kl": 0.157470703125,
"learning_rate": 8.166666666666666e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.75,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 582.0,
"epoch": 0.5520833333333334,
"grad_norm": 3.3869149684906006,
"kl": 0.143798828125,
"learning_rate": 8.159722222222222e-07,
"loss": 0.0001,
"reward": 1.5,
"reward_std": 0.3514062538743019,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 1.0,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 551.21875,
"epoch": 0.5541666666666667,
"grad_norm": 1.9510457515716553,
"kl": 0.192626953125,
"learning_rate": 8.152777777777777e-07,
"loss": 0.0002,
"reward": 1.1875,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 526.03125,
"epoch": 0.55625,
"grad_norm": 1.3092948198318481,
"kl": 0.175537109375,
"learning_rate": 8.145833333333333e-07,
"loss": 0.0002,
"reward": 0.375,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.25,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 506.53125,
"epoch": 0.5583333333333333,
"grad_norm": 0.09412389248609543,
"kl": 0.2783203125,
"learning_rate": 8.138888888888888e-07,
"loss": 0.0003,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 522.46875,
"epoch": 0.5604166666666667,
"grad_norm": 1.6767691373825073,
"kl": 0.16552734375,
"learning_rate": 8.131944444444444e-07,
"loss": 0.0002,
"reward": 1.78125,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 1.0,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 514.8125,
"epoch": 0.5625,
"grad_norm": 1.0303244590759277,
"kl": 0.160888671875,
"learning_rate": 8.125e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 503.5625,
"epoch": 0.5645833333333333,
"grad_norm": 3.553706169128418,
"kl": 0.169189453125,
"learning_rate": 8.118055555555555e-07,
"loss": 0.0002,
"reward": 1.65625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.96875,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 533.65625,
"epoch": 0.5666666666666667,
"grad_norm": 1.0599257946014404,
"kl": 0.183837890625,
"learning_rate": 8.11111111111111e-07,
"loss": 0.0002,
"reward": 1.0625,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.75,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 518.90625,
"epoch": 0.56875,
"grad_norm": 2.1599457263946533,
"kl": 0.187744140625,
"learning_rate": 8.104166666666666e-07,
"loss": 0.0002,
"reward": 0.875,
"reward_std": 0.2314550280570984,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.5,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 502.65625,
"epoch": 0.5708333333333333,
"grad_norm": 4.606043338775635,
"kl": 0.165771484375,
"learning_rate": 8.097222222222222e-07,
"loss": 0.0002,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 454.125,
"epoch": 0.5729166666666666,
"grad_norm": 1.3466901779174805,
"kl": 0.181396484375,
"learning_rate": 8.090277777777777e-07,
"loss": 0.0002,
"reward": 1.1875,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 494.65625,
"epoch": 0.575,
"grad_norm": 0.010240813717246056,
"kl": 0.19287109375,
"learning_rate": 8.083333333333334e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 504.8125,
"epoch": 0.5770833333333333,
"grad_norm": 12.152362823486328,
"kl": 0.1728515625,
"learning_rate": 8.076388888888889e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.3650856465101242,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 1.0,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 483.96875,
"epoch": 0.5791666666666667,
"grad_norm": 11.584708213806152,
"kl": 0.1640625,
"learning_rate": 8.069444444444445e-07,
"loss": 0.0002,
"reward": 1.21875,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 524.0,
"epoch": 0.58125,
"grad_norm": 1.5547815561294556,
"kl": 0.181640625,
"learning_rate": 8.0625e-07,
"loss": 0.0002,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 482.96875,
"epoch": 0.5833333333333334,
"grad_norm": 1.335976004600525,
"kl": 0.177490234375,
"learning_rate": 8.055555555555556e-07,
"loss": 0.0002,
"reward": 0.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.25,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 492.96875,
"epoch": 0.5854166666666667,
"grad_norm": 2.280996322631836,
"kl": 0.189208984375,
"learning_rate": 8.048611111111111e-07,
"loss": 0.0002,
"reward": 0.6875,
"reward_std": 0.2587745785713196,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.5,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 507.03125,
"epoch": 0.5875,
"grad_norm": 1.1769981384277344,
"kl": 0.16650390625,
"learning_rate": 8.041666666666667e-07,
"loss": 0.0002,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.75,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 537.75,
"epoch": 0.5895833333333333,
"grad_norm": 1.1851226091384888,
"kl": 0.1767578125,
"learning_rate": 8.034722222222222e-07,
"loss": 0.0002,
"reward": 1.84375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 1.0,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 514.65625,
"epoch": 0.5916666666666667,
"grad_norm": 1.5428513288497925,
"kl": 0.178955078125,
"learning_rate": 8.027777777777778e-07,
"loss": 0.0002,
"reward": 1.1875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 527.53125,
"epoch": 0.59375,
"grad_norm": 0.058219242841005325,
"kl": 0.23583984375,
"learning_rate": 8.020833333333333e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 1.0,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 487.15625,
"epoch": 0.5958333333333333,
"grad_norm": 1.0132204294204712,
"kl": 0.181884765625,
"learning_rate": 8.013888888888889e-07,
"loss": 0.0002,
"reward": 1.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 1.0,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 514.4375,
"epoch": 0.5979166666666667,
"grad_norm": 0.008692107163369656,
"kl": 0.1728515625,
"learning_rate": 8.006944444444444e-07,
"loss": 0.0002,
"reward": 1.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 480.15625,
"epoch": 0.6,
"grad_norm": 0.03367244824767113,
"kl": 0.18310546875,
"learning_rate": 8e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 509.3125,
"epoch": 0.6020833333333333,
"grad_norm": 0.008252977393567562,
"kl": 0.1708984375,
"learning_rate": 7.993055555555555e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 488.375,
"epoch": 0.6041666666666666,
"grad_norm": 2.225816249847412,
"kl": 0.173095703125,
"learning_rate": 7.986111111111112e-07,
"loss": 0.0002,
"reward": 1.53125,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 1.0,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 531.0625,
"epoch": 0.60625,
"grad_norm": 0.021306902170181274,
"kl": 0.166259765625,
"learning_rate": 7.979166666666667e-07,
"loss": 0.0002,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.5,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 535.875,
"epoch": 0.6083333333333333,
"grad_norm": 1.042975664138794,
"kl": 0.16015625,
"learning_rate": 7.972222222222223e-07,
"loss": 0.0002,
"reward": 1.65625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 1.0,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 499.875,
"epoch": 0.6104166666666667,
"grad_norm": 1.670255184173584,
"kl": 0.168212890625,
"learning_rate": 7.965277777777777e-07,
"loss": 0.0002,
"reward": 0.875,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.5,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 484.09375,
"epoch": 0.6125,
"grad_norm": 1.954439640045166,
"kl": 0.171142578125,
"learning_rate": 7.958333333333333e-07,
"loss": 0.0002,
"reward": 1.375,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.75,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 519.59375,
"epoch": 0.6145833333333334,
"grad_norm": 1.7601760625839233,
"kl": 0.1767578125,
"learning_rate": 7.951388888888888e-07,
"loss": 0.0002,
"reward": 1.90625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.90625,
"rewards/format_reward": 1.0,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 491.34375,
"epoch": 0.6166666666666667,
"grad_norm": 10.422201156616211,
"kl": 0.1806640625,
"learning_rate": 7.944444444444444e-07,
"loss": 0.0002,
"reward": 0.90625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.5,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 509.375,
"epoch": 0.61875,
"grad_norm": 0.028056718409061432,
"kl": 0.180419921875,
"learning_rate": 7.937499999999999e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 1.0,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 482.65625,
"epoch": 0.6208333333333333,
"grad_norm": 1.2360841035842896,
"kl": 0.1806640625,
"learning_rate": 7.930555555555555e-07,
"loss": 0.0002,
"reward": 0.90625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.5,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 421.71875,
"epoch": 0.6229166666666667,
"grad_norm": 2.3312301635742188,
"kl": 0.180419921875,
"learning_rate": 7.92361111111111e-07,
"loss": 0.0002,
"reward": 1.75,
"reward_std": 0.3335031494498253,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 493.3125,
"epoch": 0.625,
"grad_norm": 0.00832283217459917,
"kl": 0.1640625,
"learning_rate": 7.916666666666666e-07,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 462.59375,
"epoch": 0.6270833333333333,
"grad_norm": 1.8436075448989868,
"kl": 0.178466796875,
"learning_rate": 7.909722222222221e-07,
"loss": 0.0002,
"reward": 1.40625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.71875,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 450.21875,
"epoch": 0.6291666666666667,
"grad_norm": 0.011904319748282433,
"kl": 0.176025390625,
"learning_rate": 7.902777777777777e-07,
"loss": 0.0002,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 485.90625,
"epoch": 0.63125,
"grad_norm": 1.144376516342163,
"kl": 0.183837890625,
"learning_rate": 7.895833333333332e-07,
"loss": 0.0002,
"reward": 1.34375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 467.84375,
"epoch": 0.6333333333333333,
"grad_norm": 3.746612310409546,
"kl": 0.182373046875,
"learning_rate": 7.888888888888889e-07,
"loss": 0.0002,
"reward": 1.375,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.75,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 480.9375,
"epoch": 0.6354166666666666,
"grad_norm": 2.1090877056121826,
"kl": 0.1728515625,
"learning_rate": 7.881944444444444e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.46875,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 449.40625,
"epoch": 0.6375,
"grad_norm": 0.009726927615702152,
"kl": 0.17724609375,
"learning_rate": 7.875e-07,
"loss": 0.0002,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.25,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 452.75,
"epoch": 0.6395833333333333,
"grad_norm": 3.932494640350342,
"kl": 0.184326171875,
"learning_rate": 7.868055555555555e-07,
"loss": 0.0002,
"reward": 1.9375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 428.875,
"epoch": 0.6416666666666667,
"grad_norm": 2.0402674674987793,
"kl": 0.180908203125,
"learning_rate": 7.861111111111111e-07,
"loss": 0.0002,
"reward": 1.875,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 1.0,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 451.625,
"epoch": 0.64375,
"grad_norm": 2.2270169258117676,
"kl": 0.180908203125,
"learning_rate": 7.854166666666666e-07,
"loss": 0.0002,
"reward": 0.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.25,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 417.34375,
"epoch": 0.6458333333333334,
"grad_norm": 2.4076709747314453,
"kl": 0.183837890625,
"learning_rate": 7.847222222222222e-07,
"loss": 0.0002,
"reward": 1.8125,
"reward_std": 0.3335031494498253,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 1.0,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 440.4375,
"epoch": 0.6479166666666667,
"grad_norm": 1.1454743146896362,
"kl": 0.185791015625,
"learning_rate": 7.840277777777777e-07,
"loss": 0.0002,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 413.1875,
"epoch": 0.65,
"grad_norm": 2.555177927017212,
"kl": 0.1884765625,
"learning_rate": 7.833333333333333e-07,
"loss": 0.0002,
"reward": 1.375,
"reward_std": 0.2925042062997818,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.75,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 437.09375,
"epoch": 0.6520833333333333,
"grad_norm": 1.292601227760315,
"kl": 0.180419921875,
"learning_rate": 7.826388888888888e-07,
"loss": 0.0002,
"reward": 1.1875,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 426.5625,
"epoch": 0.6541666666666667,
"grad_norm": 6.739890098571777,
"kl": 0.18212890625,
"learning_rate": 7.819444444444444e-07,
"loss": 0.0002,
"reward": 1.625,
"reward_std": 0.3104073107242584,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 1.0,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 384.65625,
"epoch": 0.65625,
"grad_norm": 1.3566937446594238,
"kl": 0.197998046875,
"learning_rate": 7.812499999999999e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 426.09375,
"epoch": 0.6583333333333333,
"grad_norm": 1.7616151571273804,
"kl": 0.17529296875,
"learning_rate": 7.805555555555555e-07,
"loss": 0.0002,
"reward": 1.34375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 419.5,
"epoch": 0.6604166666666667,
"grad_norm": 1.9657090902328491,
"kl": 0.206298828125,
"learning_rate": 7.79861111111111e-07,
"loss": 0.0002,
"reward": 1.28125,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.75,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 403.21875,
"epoch": 0.6625,
"grad_norm": 3.3363990783691406,
"kl": 0.18115234375,
"learning_rate": 7.791666666666667e-07,
"loss": 0.0002,
"reward": 1.15625,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.75,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 403.9375,
"epoch": 0.6645833333333333,
"grad_norm": 1.5157190561294556,
"kl": 0.207763671875,
"learning_rate": 7.784722222222222e-07,
"loss": 0.0002,
"reward": 1.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 428.46875,
"epoch": 0.6666666666666666,
"grad_norm": 0.01322422455996275,
"kl": 0.200439453125,
"learning_rate": 7.777777777777778e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 401.5,
"epoch": 0.66875,
"grad_norm": 1.8207961320877075,
"kl": 0.1865234375,
"learning_rate": 7.770833333333333e-07,
"loss": 0.0002,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 431.15625,
"epoch": 0.6708333333333333,
"grad_norm": 2.6609857082366943,
"kl": 0.181884765625,
"learning_rate": 7.763888888888889e-07,
"loss": 0.0002,
"reward": 1.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 1.0,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 413.71875,
"epoch": 0.6729166666666667,
"grad_norm": 0.01364789716899395,
"kl": 0.18896484375,
"learning_rate": 7.756944444444444e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 438.25,
"epoch": 0.675,
"grad_norm": 1.3705228567123413,
"kl": 0.185546875,
"learning_rate": 7.75e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 437.90625,
"epoch": 0.6770833333333334,
"grad_norm": 1.159696340560913,
"kl": 0.17724609375,
"learning_rate": 7.743055555555555e-07,
"loss": 0.0002,
"reward": 1.15625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.75,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 441.28125,
"epoch": 0.6791666666666667,
"grad_norm": 0.011270968243479729,
"kl": 0.187744140625,
"learning_rate": 7.736111111111111e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 431.40625,
"epoch": 0.68125,
"grad_norm": 2.9195308685302734,
"kl": 0.18310546875,
"learning_rate": 7.729166666666666e-07,
"loss": 0.0002,
"reward": 1.40625,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.71875,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 445.6875,
"epoch": 0.6833333333333333,
"grad_norm": 6.094662666320801,
"kl": 0.196044921875,
"learning_rate": 7.722222222222222e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 450.625,
"epoch": 0.6854166666666667,
"grad_norm": 1.3171031475067139,
"kl": 0.18212890625,
"learning_rate": 7.715277777777777e-07,
"loss": 0.0002,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.5,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 462.59375,
"epoch": 0.6875,
"grad_norm": 2.229274034500122,
"kl": 0.191162109375,
"learning_rate": 7.708333333333333e-07,
"loss": 0.0002,
"reward": 1.90625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.90625,
"rewards/format_reward": 1.0,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 423.5625,
"epoch": 0.6895833333333333,
"grad_norm": 2.1081833839416504,
"kl": 0.203857421875,
"learning_rate": 7.701388888888888e-07,
"loss": 0.0002,
"reward": 1.34375,
"reward_std": 0.2773705795407295,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.71875,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 446.5625,
"epoch": 0.6916666666666667,
"grad_norm": 0.007810765411704779,
"kl": 0.1826171875,
"learning_rate": 7.694444444444445e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 421.4375,
"epoch": 0.69375,
"grad_norm": 2.289745569229126,
"kl": 0.190185546875,
"learning_rate": 7.6875e-07,
"loss": 0.0002,
"reward": 1.375,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 1.0,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 443.3125,
"epoch": 0.6958333333333333,
"grad_norm": 1.1293355226516724,
"kl": 0.192626953125,
"learning_rate": 7.680555555555556e-07,
"loss": 0.0002,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 410.21875,
"epoch": 0.6979166666666666,
"grad_norm": 1.968024492263794,
"kl": 0.200927734375,
"learning_rate": 7.673611111111112e-07,
"loss": 0.0002,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 456.46875,
"epoch": 0.7,
"grad_norm": 2.0755598545074463,
"kl": 0.206298828125,
"learning_rate": 7.666666666666667e-07,
"loss": 0.0002,
"reward": 1.875,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 1.0,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 449.0,
"epoch": 0.7020833333333333,
"grad_norm": 1.1718143224716187,
"kl": 0.2001953125,
"learning_rate": 7.659722222222223e-07,
"loss": 0.0002,
"reward": 0.375,
"reward_std": 0.18898223340511322,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.21875,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 436.0,
"epoch": 0.7041666666666667,
"grad_norm": 1.2479380369186401,
"kl": 0.199462890625,
"learning_rate": 7.652777777777777e-07,
"loss": 0.0002,
"reward": 1.625,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 1.0,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 400.40625,
"epoch": 0.70625,
"grad_norm": 1.1230976581573486,
"kl": 0.204833984375,
"learning_rate": 7.645833333333332e-07,
"loss": 0.0002,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 422.46875,
"epoch": 0.7083333333333334,
"grad_norm": 1.2302477359771729,
"kl": 0.209228515625,
"learning_rate": 7.638888888888888e-07,
"loss": 0.0002,
"reward": 1.53125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 1.0,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 392.40625,
"epoch": 0.7104166666666667,
"grad_norm": 0.010557924397289753,
"kl": 0.21630859375,
"learning_rate": 7.631944444444443e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 427.0,
"epoch": 0.7125,
"grad_norm": 2.4895095825195312,
"kl": 0.206298828125,
"learning_rate": 7.624999999999999e-07,
"loss": 0.0002,
"reward": 1.5625,
"reward_std": 0.4355512708425522,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 1.0,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 434.96875,
"epoch": 0.7145833333333333,
"grad_norm": 1.8133002519607544,
"kl": 0.19384765625,
"learning_rate": 7.618055555555554e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 430.1875,
"epoch": 0.7166666666666667,
"grad_norm": 0.015499824658036232,
"kl": 0.216552734375,
"learning_rate": 7.61111111111111e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 1.0,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 428.46875,
"epoch": 0.71875,
"grad_norm": 0.012258811853826046,
"kl": 0.20947265625,
"learning_rate": 7.604166666666666e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 423.125,
"epoch": 0.7208333333333333,
"grad_norm": 1.241074562072754,
"kl": 0.1982421875,
"learning_rate": 7.597222222222221e-07,
"loss": 0.0002,
"reward": 0.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.25,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 426.28125,
"epoch": 0.7229166666666667,
"grad_norm": 2.9186177253723145,
"kl": 0.204345703125,
"learning_rate": 7.590277777777778e-07,
"loss": 0.0002,
"reward": 0.6875,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.5,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 429.46875,
"epoch": 0.725,
"grad_norm": 2.4491827487945557,
"kl": 0.199951171875,
"learning_rate": 7.583333333333333e-07,
"loss": 0.0002,
"reward": 0.90625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.5,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 432.15625,
"epoch": 0.7270833333333333,
"grad_norm": 1.914108157157898,
"kl": 0.21044921875,
"learning_rate": 7.576388888888889e-07,
"loss": 0.0002,
"reward": 0.9375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.5,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 419.53125,
"epoch": 0.7291666666666666,
"grad_norm": 2.4412832260131836,
"kl": 0.236572265625,
"learning_rate": 7.569444444444444e-07,
"loss": 0.0002,
"reward": 1.375,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.75,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 452.34375,
"epoch": 0.73125,
"grad_norm": 0.007453648839145899,
"kl": 0.197509765625,
"learning_rate": 7.5625e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 448.0625,
"epoch": 0.7333333333333333,
"grad_norm": 2.017571449279785,
"kl": 0.290283203125,
"learning_rate": 7.555555555555555e-07,
"loss": 0.0003,
"reward": 1.875,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 1.0,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 449.875,
"epoch": 0.7354166666666667,
"grad_norm": 1.6187738180160522,
"kl": 0.20166015625,
"learning_rate": 7.548611111111111e-07,
"loss": 0.0002,
"reward": 1.90625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.90625,
"rewards/format_reward": 1.0,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 480.71875,
"epoch": 0.7375,
"grad_norm": 1.283926248550415,
"kl": 0.2021484375,
"learning_rate": 7.541666666666666e-07,
"loss": 0.0002,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 453.125,
"epoch": 0.7395833333333334,
"grad_norm": 1.8160797357559204,
"kl": 0.226806640625,
"learning_rate": 7.534722222222222e-07,
"loss": 0.0002,
"reward": 1.375,
"reward_std": 0.2314550280570984,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.75,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 456.15625,
"epoch": 0.7416666666666667,
"grad_norm": 2.673011541366577,
"kl": 0.205078125,
"learning_rate": 7.527777777777777e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.3514062538743019,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 459.96875,
"epoch": 0.74375,
"grad_norm": 1.9269709587097168,
"kl": 0.19482421875,
"learning_rate": 7.520833333333333e-07,
"loss": 0.0002,
"reward": 1.4375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 456.09375,
"epoch": 0.7458333333333333,
"grad_norm": 12.258075714111328,
"kl": 0.20458984375,
"learning_rate": 7.513888888888888e-07,
"loss": 0.0002,
"reward": 1.9375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 419.71875,
"epoch": 0.7479166666666667,
"grad_norm": 2.0176916122436523,
"kl": 0.210205078125,
"learning_rate": 7.506944444444444e-07,
"loss": 0.0002,
"reward": 1.21875,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 425.75,
"epoch": 0.75,
"grad_norm": 1.29171621799469,
"kl": 0.2138671875,
"learning_rate": 7.5e-07,
"loss": 0.0002,
"reward": 1.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 421.0625,
"epoch": 0.7520833333333333,
"grad_norm": 1.6412715911865234,
"kl": 0.21044921875,
"learning_rate": 7.493055555555556e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 426.15625,
"epoch": 0.7541666666666667,
"grad_norm": 2.2163078784942627,
"kl": 0.227783203125,
"learning_rate": 7.486111111111111e-07,
"loss": 0.0002,
"reward": 1.40625,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.75,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 432.28125,
"epoch": 0.75625,
"grad_norm": 0.5271323323249817,
"kl": 0.562744140625,
"learning_rate": 7.479166666666667e-07,
"loss": 0.0006,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.5,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 422.5,
"epoch": 0.7583333333333333,
"grad_norm": 3.3099429607391357,
"kl": 0.22119140625,
"learning_rate": 7.472222222222222e-07,
"loss": 0.0002,
"reward": 1.84375,
"reward_std": 0.3061639815568924,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 1.0,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 434.96875,
"epoch": 0.7604166666666666,
"grad_norm": 0.02184051275253296,
"kl": 0.22314453125,
"learning_rate": 7.465277777777778e-07,
"loss": 0.0002,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 397.84375,
"epoch": 0.7625,
"grad_norm": 2.9574718475341797,
"kl": 0.2255859375,
"learning_rate": 7.458333333333333e-07,
"loss": 0.0002,
"reward": 1.1875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 427.90625,
"epoch": 0.7645833333333333,
"grad_norm": 1.6166619062423706,
"kl": 0.227294921875,
"learning_rate": 7.451388888888889e-07,
"loss": 0.0002,
"reward": 0.65625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.5,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 400.875,
"epoch": 0.7666666666666667,
"grad_norm": 1.139664888381958,
"kl": 0.222412109375,
"learning_rate": 7.444444444444444e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 408.625,
"epoch": 0.76875,
"grad_norm": 1.73232901096344,
"kl": 0.248779296875,
"learning_rate": 7.4375e-07,
"loss": 0.0002,
"reward": 1.90625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.90625,
"rewards/format_reward": 1.0,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 412.625,
"epoch": 0.7708333333333334,
"grad_norm": 0.019330745562911034,
"kl": 0.22314453125,
"learning_rate": 7.430555555555555e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 397.0,
"epoch": 0.7729166666666667,
"grad_norm": 2.8990607261657715,
"kl": 0.215087890625,
"learning_rate": 7.423611111111111e-07,
"loss": 0.0002,
"reward": 1.71875,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 416.6875,
"epoch": 0.775,
"grad_norm": 1.9207866191864014,
"kl": 0.217529296875,
"learning_rate": 7.416666666666666e-07,
"loss": 0.0002,
"reward": 1.4375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 362.5625,
"epoch": 0.7770833333333333,
"grad_norm": 0.010317516513168812,
"kl": 0.248779296875,
"learning_rate": 7.409722222222222e-07,
"loss": 0.0002,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 374.5625,
"epoch": 0.7791666666666667,
"grad_norm": 1.9645934104919434,
"kl": 0.240966796875,
"learning_rate": 7.402777777777778e-07,
"loss": 0.0002,
"reward": 1.9375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 380.875,
"epoch": 0.78125,
"grad_norm": 3.6975531578063965,
"kl": 0.228759765625,
"learning_rate": 7.395833333333334e-07,
"loss": 0.0002,
"reward": 1.84375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 1.0,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 384.5,
"epoch": 0.7833333333333333,
"grad_norm": 0.015749173238873482,
"kl": 0.24609375,
"learning_rate": 7.388888888888889e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 355.125,
"epoch": 0.7854166666666667,
"grad_norm": 1.320116400718689,
"kl": 0.27734375,
"learning_rate": 7.381944444444445e-07,
"loss": 0.0003,
"reward": 0.34375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.25,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 386.40625,
"epoch": 0.7875,
"grad_norm": 1.221765398979187,
"kl": 0.247802734375,
"learning_rate": 7.375e-07,
"loss": 0.0002,
"reward": 0.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.25,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 345.875,
"epoch": 0.7895833333333333,
"grad_norm": 2.8052141666412354,
"kl": 0.269287109375,
"learning_rate": 7.368055555555556e-07,
"loss": 0.0003,
"reward": 1.46875,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 1.0,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 354.96875,
"epoch": 0.7916666666666666,
"grad_norm": 1.387290596961975,
"kl": 0.241455078125,
"learning_rate": 7.361111111111111e-07,
"loss": 0.0002,
"reward": 0.90625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.5,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 368.03125,
"epoch": 0.79375,
"grad_norm": 1.335808515548706,
"kl": 0.23779296875,
"learning_rate": 7.354166666666667e-07,
"loss": 0.0002,
"reward": 1.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 359.78125,
"epoch": 0.7958333333333333,
"grad_norm": 0.013379854150116444,
"kl": 0.244873046875,
"learning_rate": 7.347222222222222e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 357.90625,
"epoch": 0.7979166666666667,
"grad_norm": 80.3907699584961,
"kl": 0.2431640625,
"learning_rate": 7.340277777777777e-07,
"loss": 0.0002,
"reward": 1.375,
"reward_std": 0.2925042062997818,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.75,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 364.09375,
"epoch": 0.8,
"grad_norm": 0.011957396753132343,
"kl": 0.24072265625,
"learning_rate": 7.333333333333332e-07,
"loss": 0.0002,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 320.0625,
"epoch": 0.8020833333333334,
"grad_norm": 2.5682637691497803,
"kl": 0.2353515625,
"learning_rate": 7.326388888888888e-07,
"loss": 0.0002,
"reward": 1.59375,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 1.0,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 349.0,
"epoch": 0.8041666666666667,
"grad_norm": 3.7549641132354736,
"kl": 0.228515625,
"learning_rate": 7.319444444444443e-07,
"loss": 0.0002,
"reward": 1.78125,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 1.0,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 358.53125,
"epoch": 0.80625,
"grad_norm": 0.009337302297353745,
"kl": 0.224609375,
"learning_rate": 7.312499999999999e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 360.34375,
"epoch": 0.8083333333333333,
"grad_norm": 1.9596688747406006,
"kl": 0.2333984375,
"learning_rate": 7.305555555555554e-07,
"loss": 0.0002,
"reward": 1.90625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.90625,
"rewards/format_reward": 1.0,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 330.0,
"epoch": 0.8104166666666667,
"grad_norm": 0.01328630093485117,
"kl": 0.242431640625,
"learning_rate": 7.298611111111111e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 347.5,
"epoch": 0.8125,
"grad_norm": 3.5032737255096436,
"kl": 0.228271484375,
"learning_rate": 7.291666666666666e-07,
"loss": 0.0002,
"reward": 0.65625,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.5,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 340.875,
"epoch": 0.8145833333333333,
"grad_norm": 1.401888370513916,
"kl": 0.28076171875,
"learning_rate": 7.284722222222222e-07,
"loss": 0.0003,
"reward": 1.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 340.25,
"epoch": 0.8166666666666667,
"grad_norm": 1.3528683185577393,
"kl": 0.249755859375,
"learning_rate": 7.277777777777777e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 365.28125,
"epoch": 0.81875,
"grad_norm": 1.6087630987167358,
"kl": 0.22900390625,
"learning_rate": 7.270833333333333e-07,
"loss": 0.0002,
"reward": 1.40625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 1.0,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 385.9375,
"epoch": 0.8208333333333333,
"grad_norm": 2.008995771408081,
"kl": 0.228515625,
"learning_rate": 7.263888888888888e-07,
"loss": 0.0002,
"reward": 1.65625,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 1.0,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 366.625,
"epoch": 0.8229166666666666,
"grad_norm": 0.011729438789188862,
"kl": 0.25,
"learning_rate": 7.256944444444444e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 353.78125,
"epoch": 0.825,
"grad_norm": 1.4965615272521973,
"kl": 0.25634765625,
"learning_rate": 7.249999999999999e-07,
"loss": 0.0003,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 372.625,
"epoch": 0.8270833333333333,
"grad_norm": 1.816392183303833,
"kl": 0.222412109375,
"learning_rate": 7.243055555555555e-07,
"loss": 0.0002,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 410.65625,
"epoch": 0.8291666666666667,
"grad_norm": 0.011323979124426842,
"kl": 0.22607421875,
"learning_rate": 7.23611111111111e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 410.3125,
"epoch": 0.83125,
"grad_norm": 6.289952754974365,
"kl": 0.220703125,
"learning_rate": 7.229166666666666e-07,
"loss": 0.0002,
"reward": 0.78125,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.5,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 388.21875,
"epoch": 0.8333333333333334,
"grad_norm": 2.264404296875,
"kl": 0.20703125,
"learning_rate": 7.222222222222221e-07,
"loss": 0.0002,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.5,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 404.875,
"epoch": 0.8354166666666667,
"grad_norm": 2.491809368133545,
"kl": 0.220703125,
"learning_rate": 7.215277777777777e-07,
"loss": 0.0002,
"reward": 1.84375,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 1.0,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 411.34375,
"epoch": 0.8375,
"grad_norm": 1.1848517656326294,
"kl": 0.21728515625,
"learning_rate": 7.208333333333332e-07,
"loss": 0.0002,
"reward": 1.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 402.21875,
"epoch": 0.8395833333333333,
"grad_norm": 1.2143757343292236,
"kl": 0.22607421875,
"learning_rate": 7.201388888888889e-07,
"loss": 0.0002,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 437.625,
"epoch": 0.8416666666666667,
"grad_norm": 0.008448748849332333,
"kl": 0.208984375,
"learning_rate": 7.194444444444445e-07,
"loss": 0.0002,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.25,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 416.96875,
"epoch": 0.84375,
"grad_norm": 0.007129390258342028,
"kl": 0.20654296875,
"learning_rate": 7.1875e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 431.0625,
"epoch": 0.8458333333333333,
"grad_norm": 2.8381500244140625,
"kl": 0.219482421875,
"learning_rate": 7.180555555555556e-07,
"loss": 0.0002,
"reward": 1.65625,
"reward_std": 0.4628904387354851,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 1.0,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 428.5,
"epoch": 0.8479166666666667,
"grad_norm": 2.8964383602142334,
"kl": 0.20703125,
"learning_rate": 7.173611111111111e-07,
"loss": 0.0002,
"reward": 1.6875,
"reward_std": 0.2925042062997818,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 1.0,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 415.1875,
"epoch": 0.85,
"grad_norm": 0.03928419202566147,
"kl": 0.221435546875,
"learning_rate": 7.166666666666667e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 442.46875,
"epoch": 0.8520833333333333,
"grad_norm": 1.7243295907974243,
"kl": 0.2099609375,
"learning_rate": 7.159722222222222e-07,
"loss": 0.0002,
"reward": 1.34375,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 427.1875,
"epoch": 0.8541666666666666,
"grad_norm": 1.768519401550293,
"kl": 0.199951171875,
"learning_rate": 7.152777777777778e-07,
"loss": 0.0002,
"reward": 1.4375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 396.5,
"epoch": 0.85625,
"grad_norm": 0.018853794783353806,
"kl": 0.239013671875,
"learning_rate": 7.145833333333333e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 421.9375,
"epoch": 0.8583333333333333,
"grad_norm": 1.988168716430664,
"kl": 0.22119140625,
"learning_rate": 7.138888888888889e-07,
"loss": 0.0002,
"reward": 1.09375,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.75,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 456.15625,
"epoch": 0.8604166666666667,
"grad_norm": 0.010176747106015682,
"kl": 0.214599609375,
"learning_rate": 7.131944444444444e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 437.90625,
"epoch": 0.8625,
"grad_norm": 2.550144910812378,
"kl": 0.22021484375,
"learning_rate": 7.125e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.2925042062997818,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.75,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 418.8125,
"epoch": 0.8645833333333334,
"grad_norm": 0.018261313438415527,
"kl": 0.200927734375,
"learning_rate": 7.118055555555555e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 443.71875,
"epoch": 0.8666666666666667,
"grad_norm": 0.008350728079676628,
"kl": 0.212646484375,
"learning_rate": 7.111111111111111e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 456.125,
"epoch": 0.86875,
"grad_norm": 2.7766783237457275,
"kl": 0.21484375,
"learning_rate": 7.104166666666667e-07,
"loss": 0.0002,
"reward": 1.34375,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 434.53125,
"epoch": 0.8708333333333333,
"grad_norm": 1.4548749923706055,
"kl": 0.215087890625,
"learning_rate": 7.097222222222223e-07,
"loss": 0.0002,
"reward": 1.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 434.1875,
"epoch": 0.8729166666666667,
"grad_norm": 0.007628277875483036,
"kl": 0.211181640625,
"learning_rate": 7.090277777777778e-07,
"loss": 0.0002,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 454.5,
"epoch": 0.875,
"grad_norm": 2.389847993850708,
"kl": 0.203125,
"learning_rate": 7.083333333333334e-07,
"loss": 0.0002,
"reward": 0.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.25,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 438.71875,
"epoch": 0.8770833333333333,
"grad_norm": 1.6909396648406982,
"kl": 0.21923828125,
"learning_rate": 7.076388888888889e-07,
"loss": 0.0002,
"reward": 1.8125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 1.0,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 459.84375,
"epoch": 0.8791666666666667,
"grad_norm": 2.283431053161621,
"kl": 0.2138671875,
"learning_rate": 7.069444444444445e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.2587745785713196,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 1.0,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 434.28125,
"epoch": 0.88125,
"grad_norm": 1.1481057405471802,
"kl": 0.2109375,
"learning_rate": 7.0625e-07,
"loss": 0.0002,
"reward": 1.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 477.03125,
"epoch": 0.8833333333333333,
"grad_norm": 0.01631513424217701,
"kl": 0.22802734375,
"learning_rate": 7.055555555555556e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 445.1875,
"epoch": 0.8854166666666666,
"grad_norm": 1.1521072387695312,
"kl": 0.21484375,
"learning_rate": 7.048611111111111e-07,
"loss": 0.0002,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.5,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 469.78125,
"epoch": 0.8875,
"grad_norm": 4.860723495483398,
"kl": 0.211181640625,
"learning_rate": 7.041666666666667e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 1.0,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 426.84375,
"epoch": 0.8895833333333333,
"grad_norm": 0.011157250963151455,
"kl": 0.23193359375,
"learning_rate": 7.034722222222222e-07,
"loss": 0.0002,
"reward": 1.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 470.78125,
"epoch": 0.8916666666666667,
"grad_norm": 1.62472665309906,
"kl": 0.231201171875,
"learning_rate": 7.027777777777777e-07,
"loss": 0.0002,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.75,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 433.65625,
"epoch": 0.89375,
"grad_norm": 2.0805013179779053,
"kl": 0.21826171875,
"learning_rate": 7.020833333333332e-07,
"loss": 0.0002,
"reward": 1.40625,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.75,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 425.59375,
"epoch": 0.8958333333333334,
"grad_norm": 0.012136026285588741,
"kl": 0.23193359375,
"learning_rate": 7.013888888888888e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 445.21875,
"epoch": 0.8979166666666667,
"grad_norm": 1.2128183841705322,
"kl": 0.22119140625,
"learning_rate": 7.006944444444444e-07,
"loss": 0.0002,
"reward": 1.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.75,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 446.1875,
"epoch": 0.9,
"grad_norm": 1.5345137119293213,
"kl": 0.217529296875,
"learning_rate": 7e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 388.125,
"epoch": 0.9020833333333333,
"grad_norm": 1.3869545459747314,
"kl": 0.236083984375,
"learning_rate": 6.993055555555555e-07,
"loss": 0.0002,
"reward": 1.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 438.90625,
"epoch": 0.9041666666666667,
"grad_norm": 0.009476087987422943,
"kl": 0.215576171875,
"learning_rate": 6.986111111111111e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 445.125,
"epoch": 0.90625,
"grad_norm": 0.010908279567956924,
"kl": 0.235107421875,
"learning_rate": 6.979166666666666e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 472.53125,
"epoch": 0.9083333333333333,
"grad_norm": 2.5724964141845703,
"kl": 0.225830078125,
"learning_rate": 6.972222222222222e-07,
"loss": 0.0002,
"reward": 1.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.75,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 421.3125,
"epoch": 0.9104166666666667,
"grad_norm": 1.8228105306625366,
"kl": 0.2275390625,
"learning_rate": 6.965277777777777e-07,
"loss": 0.0002,
"reward": 1.15625,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.75,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 468.625,
"epoch": 0.9125,
"grad_norm": 0.011684931814670563,
"kl": 0.217529296875,
"learning_rate": 6.958333333333333e-07,
"loss": 0.0002,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.25,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 444.46875,
"epoch": 0.9145833333333333,
"grad_norm": 3.1498048305511475,
"kl": 0.257568359375,
"learning_rate": 6.951388888888888e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 445.75,
"epoch": 0.9166666666666666,
"grad_norm": 3.2582144737243652,
"kl": 0.224609375,
"learning_rate": 6.944444444444444e-07,
"loss": 0.0002,
"reward": 1.78125,
"reward_std": 0.3061639815568924,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 1.0,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 430.375,
"epoch": 0.91875,
"grad_norm": 3.094038963317871,
"kl": 0.238037109375,
"learning_rate": 6.937499999999999e-07,
"loss": 0.0002,
"reward": 0.875,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.5,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 424.59375,
"epoch": 0.9208333333333333,
"grad_norm": 7.279547214508057,
"kl": 0.23828125,
"learning_rate": 6.930555555555555e-07,
"loss": 0.0002,
"reward": 1.28125,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.75,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 450.5625,
"epoch": 0.9229166666666667,
"grad_norm": 3.03475022315979,
"kl": 0.219970703125,
"learning_rate": 6.92361111111111e-07,
"loss": 0.0002,
"reward": 1.21875,
"reward_std": 0.3471629247069359,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 485.6875,
"epoch": 0.925,
"grad_norm": 2.076587677001953,
"kl": 0.222412109375,
"learning_rate": 6.916666666666666e-07,
"loss": 0.0002,
"reward": 1.375,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.71875,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 439.34375,
"epoch": 0.9270833333333334,
"grad_norm": 1.3405368328094482,
"kl": 0.236572265625,
"learning_rate": 6.909722222222222e-07,
"loss": 0.0002,
"reward": 0.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.25,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 419.75,
"epoch": 0.9291666666666667,
"grad_norm": 2.1806628704071045,
"kl": 0.23779296875,
"learning_rate": 6.902777777777778e-07,
"loss": 0.0002,
"reward": 0.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.25,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 419.5,
"epoch": 0.93125,
"grad_norm": 3.1020021438598633,
"kl": 0.2451171875,
"learning_rate": 6.895833333333333e-07,
"loss": 0.0002,
"reward": 1.84375,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 1.0,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 459.1875,
"epoch": 0.9333333333333333,
"grad_norm": 0.0076973834075033665,
"kl": 0.21533203125,
"learning_rate": 6.888888888888889e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 414.625,
"epoch": 0.9354166666666667,
"grad_norm": 0.010062651708722115,
"kl": 0.24658203125,
"learning_rate": 6.881944444444444e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 417.28125,
"epoch": 0.9375,
"grad_norm": 4.19633674621582,
"kl": 0.237060546875,
"learning_rate": 6.875e-07,
"loss": 0.0002,
"reward": 1.1875,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 411.34375,
"epoch": 0.9395833333333333,
"grad_norm": 2.232433795928955,
"kl": 0.251953125,
"learning_rate": 6.868055555555555e-07,
"loss": 0.0003,
"reward": 1.09375,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.75,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 412.59375,
"epoch": 0.9416666666666667,
"grad_norm": 1.2692770957946777,
"kl": 0.24658203125,
"learning_rate": 6.861111111111111e-07,
"loss": 0.0002,
"reward": 0.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.25,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 404.96875,
"epoch": 0.94375,
"grad_norm": 0.007977345958352089,
"kl": 0.247314453125,
"learning_rate": 6.854166666666666e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 415.03125,
"epoch": 0.9458333333333333,
"grad_norm": 1.4225736856460571,
"kl": 0.239013671875,
"learning_rate": 6.847222222222222e-07,
"loss": 0.0002,
"reward": 1.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 441.4375,
"epoch": 0.9479166666666666,
"grad_norm": 1.563883662223816,
"kl": 0.228759765625,
"learning_rate": 6.840277777777777e-07,
"loss": 0.0002,
"reward": 1.03125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.75,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 413.8125,
"epoch": 0.95,
"grad_norm": 1.4256101846694946,
"kl": 0.240966796875,
"learning_rate": 6.833333333333333e-07,
"loss": 0.0002,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 448.03125,
"epoch": 0.9520833333333333,
"grad_norm": 1.4623489379882812,
"kl": 0.255859375,
"learning_rate": 6.826388888888888e-07,
"loss": 0.0003,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 420.75,
"epoch": 0.9541666666666667,
"grad_norm": 1.2219001054763794,
"kl": 0.244140625,
"learning_rate": 6.819444444444444e-07,
"loss": 0.0002,
"reward": 1.09375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.75,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 412.90625,
"epoch": 0.95625,
"grad_norm": 3.5887980461120605,
"kl": 0.24169921875,
"learning_rate": 6.8125e-07,
"loss": 0.0002,
"reward": 0.8125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.5,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 427.4375,
"epoch": 0.9583333333333334,
"grad_norm": 1.426658272743225,
"kl": 0.246826171875,
"learning_rate": 6.805555555555556e-07,
"loss": 0.0002,
"reward": 1.15625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.75,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 466.71875,
"epoch": 0.9604166666666667,
"grad_norm": 1.7160536050796509,
"kl": 0.23779296875,
"learning_rate": 6.798611111111111e-07,
"loss": 0.0002,
"reward": 1.6875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 1.0,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 432.8125,
"epoch": 0.9625,
"grad_norm": 1.2215250730514526,
"kl": 0.25537109375,
"learning_rate": 6.791666666666667e-07,
"loss": 0.0003,
"reward": 1.34375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 499.6875,
"epoch": 0.9645833333333333,
"grad_norm": 0.016700007021427155,
"kl": 0.20458984375,
"learning_rate": 6.784722222222222e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 462.03125,
"epoch": 0.9666666666666667,
"grad_norm": 1.8133171796798706,
"kl": 0.23779296875,
"learning_rate": 6.777777777777778e-07,
"loss": 0.0002,
"reward": 1.1875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 493.59375,
"epoch": 0.96875,
"grad_norm": 1.1255253553390503,
"kl": 0.225830078125,
"learning_rate": 6.770833333333333e-07,
"loss": 0.0002,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 527.71875,
"epoch": 0.9708333333333333,
"grad_norm": 0.08303828537464142,
"kl": 0.3046875,
"learning_rate": 6.763888888888889e-07,
"loss": 0.0003,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 520.34375,
"epoch": 0.9729166666666667,
"grad_norm": 2.6234073638916016,
"kl": 0.22119140625,
"learning_rate": 6.756944444444444e-07,
"loss": 0.0002,
"reward": 1.71875,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 536.96875,
"epoch": 0.975,
"grad_norm": 1.0526636838912964,
"kl": 0.232177734375,
"learning_rate": 6.75e-07,
"loss": 0.0002,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 523.46875,
"epoch": 0.9770833333333333,
"grad_norm": 1.8160419464111328,
"kl": 0.21826171875,
"learning_rate": 6.743055555555555e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 600.875,
"epoch": 0.9791666666666666,
"grad_norm": 2.412604331970215,
"kl": 0.236083984375,
"learning_rate": 6.736111111111111e-07,
"loss": 0.0002,
"reward": 1.09375,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.75,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 551.78125,
"epoch": 0.98125,
"grad_norm": 0.0071896640583872795,
"kl": 0.21728515625,
"learning_rate": 6.729166666666666e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 583.75,
"epoch": 0.9833333333333333,
"grad_norm": 2.7071609497070312,
"kl": 0.215087890625,
"learning_rate": 6.722222222222222e-07,
"loss": 0.0002,
"reward": 0.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.5,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 632.25,
"epoch": 0.9854166666666667,
"grad_norm": 1.6044063568115234,
"kl": 0.208251953125,
"learning_rate": 6.715277777777776e-07,
"loss": 0.0002,
"reward": 1.5625,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 1.0,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 601.0,
"epoch": 0.9875,
"grad_norm": 1.518963098526001,
"kl": 0.22314453125,
"learning_rate": 6.708333333333333e-07,
"loss": 0.0002,
"reward": 0.34375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.25,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 638.34375,
"epoch": 0.9895833333333334,
"grad_norm": 14.804647445678711,
"kl": 0.2041015625,
"learning_rate": 6.701388888888888e-07,
"loss": 0.0002,
"reward": 0.875,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.75,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 631.0,
"epoch": 0.9916666666666667,
"grad_norm": 1.5678462982177734,
"kl": 0.20166015625,
"learning_rate": 6.694444444444444e-07,
"loss": 0.0002,
"reward": 0.84375,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.75,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 617.875,
"epoch": 0.99375,
"grad_norm": 3.285810947418213,
"kl": 0.208251953125,
"learning_rate": 6.6875e-07,
"loss": 0.0002,
"reward": 1.6875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 1.0,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 620.03125,
"epoch": 0.9958333333333333,
"grad_norm": 1.3398975133895874,
"kl": 0.207275390625,
"learning_rate": 6.680555555555555e-07,
"loss": 0.0002,
"reward": 0.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.25,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 626.5625,
"epoch": 0.9979166666666667,
"grad_norm": 1.2645913362503052,
"kl": 0.207763671875,
"learning_rate": 6.67361111111111e-07,
"loss": 0.0002,
"reward": 1.34375,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 601.625,
"epoch": 1.0,
"grad_norm": 3.012862205505371,
"kl": 0.208984375,
"learning_rate": 6.666666666666666e-07,
"loss": 0.0002,
"reward": 1.375,
"reward_std": 0.2925042062997818,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.75,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 644.46875,
"epoch": 1.0020833333333334,
"grad_norm": 0.9152086973190308,
"kl": 0.20849609375,
"learning_rate": 6.659722222222222e-07,
"loss": 0.0002,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.75,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 682.34375,
"epoch": 1.0041666666666667,
"grad_norm": 2.6818764209747314,
"kl": 0.206787109375,
"learning_rate": 6.652777777777777e-07,
"loss": 0.0002,
"reward": 1.9375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 630.6875,
"epoch": 1.00625,
"grad_norm": 1.9887526035308838,
"kl": 0.211669921875,
"learning_rate": 6.645833333333333e-07,
"loss": 0.0002,
"reward": 0.84375,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.5,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 689.90625,
"epoch": 1.0083333333333333,
"grad_norm": 1.1005487442016602,
"kl": 0.20654296875,
"learning_rate": 6.638888888888888e-07,
"loss": 0.0002,
"reward": 1.40625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.75,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 625.53125,
"epoch": 1.0104166666666667,
"grad_norm": 1.0807424783706665,
"kl": 0.2275390625,
"learning_rate": 6.631944444444444e-07,
"loss": 0.0002,
"reward": 1.875,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 1.0,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 620.4375,
"epoch": 1.0125,
"grad_norm": 0.008356544189155102,
"kl": 0.21875,
"learning_rate": 6.624999999999999e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 627.375,
"epoch": 1.0145833333333334,
"grad_norm": 2.017381191253662,
"kl": 0.22216796875,
"learning_rate": 6.618055555555555e-07,
"loss": 0.0002,
"reward": 0.84375,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.5,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 634.78125,
"epoch": 1.0166666666666666,
"grad_norm": 2.344238042831421,
"kl": 0.21923828125,
"learning_rate": 6.611111111111111e-07,
"loss": 0.0002,
"reward": 0.6875,
"reward_std": 0.249358132481575,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.5,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 664.0625,
"epoch": 1.01875,
"grad_norm": 2.2267184257507324,
"kl": 0.2236328125,
"learning_rate": 6.604166666666667e-07,
"loss": 0.0002,
"reward": 1.21875,
"reward_std": 0.3471629247069359,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.71875,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 618.3125,
"epoch": 1.0208333333333333,
"grad_norm": 5.060998916625977,
"kl": 0.228515625,
"learning_rate": 6.597222222222222e-07,
"loss": 0.0002,
"reward": 1.59375,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 1.0,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 603.3125,
"epoch": 1.0229166666666667,
"grad_norm": 1.5494803190231323,
"kl": 0.236083984375,
"learning_rate": 6.590277777777778e-07,
"loss": 0.0002,
"reward": 1.90625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.90625,
"rewards/format_reward": 1.0,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 577.59375,
"epoch": 1.025,
"grad_norm": 0.006925213150680065,
"kl": 0.22021484375,
"learning_rate": 6.583333333333333e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 595.25,
"epoch": 1.0270833333333333,
"grad_norm": 1.8279296159744263,
"kl": 0.21240234375,
"learning_rate": 6.576388888888889e-07,
"loss": 0.0002,
"reward": 1.03125,
"reward_std": 0.2630179077386856,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.75,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 596.3125,
"epoch": 1.0291666666666666,
"grad_norm": 1.733322262763977,
"kl": 0.22802734375,
"learning_rate": 6.569444444444444e-07,
"loss": 0.0002,
"reward": 0.78125,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.5,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 576.6875,
"epoch": 1.03125,
"grad_norm": 2.433555841445923,
"kl": 0.227294921875,
"learning_rate": 6.5625e-07,
"loss": 0.0002,
"reward": 1.75,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 502.59375,
"epoch": 1.0333333333333334,
"grad_norm": 0.009394297376275063,
"kl": 0.25244140625,
"learning_rate": 6.555555555555555e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 567.5625,
"epoch": 1.0354166666666667,
"grad_norm": 1.5645465850830078,
"kl": 0.248779296875,
"learning_rate": 6.548611111111111e-07,
"loss": 0.0002,
"reward": 1.21875,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 567.0,
"epoch": 1.0375,
"grad_norm": 1.6025605201721191,
"kl": 0.2392578125,
"learning_rate": 6.541666666666666e-07,
"loss": 0.0002,
"reward": 1.9375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 555.65625,
"epoch": 1.0395833333333333,
"grad_norm": 1.1154199838638306,
"kl": 0.228271484375,
"learning_rate": 6.534722222222222e-07,
"loss": 0.0002,
"reward": 1.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 552.65625,
"epoch": 1.0416666666666667,
"grad_norm": 1.7574169635772705,
"kl": 0.26318359375,
"learning_rate": 6.527777777777777e-07,
"loss": 0.0003,
"reward": 0.78125,
"reward_std": 0.2630179077386856,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.5,
"step": 500
},
{
"clip_ratio": 0.0,
"completion_length": 554.09375,
"epoch": 1.04375,
"grad_norm": 1.7386115789413452,
"kl": 0.24609375,
"learning_rate": 6.520833333333333e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.2630179077386856,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 1.0,
"step": 501
},
{
"clip_ratio": 0.0,
"completion_length": 512.1875,
"epoch": 1.0458333333333334,
"grad_norm": 1.2160353660583496,
"kl": 0.255615234375,
"learning_rate": 6.513888888888889e-07,
"loss": 0.0003,
"reward": 0.875,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.5,
"step": 502
},
{
"clip_ratio": 0.0,
"completion_length": 508.53125,
"epoch": 1.0479166666666666,
"grad_norm": 2.100022792816162,
"kl": 0.267333984375,
"learning_rate": 6.506944444444445e-07,
"loss": 0.0003,
"reward": 1.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 503
},
{
"clip_ratio": 0.0,
"completion_length": 535.59375,
"epoch": 1.05,
"grad_norm": 3.8708770275115967,
"kl": 0.24755859375,
"learning_rate": 6.5e-07,
"loss": 0.0002,
"reward": 1.15625,
"reward_std": 0.3377464786171913,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.75,
"step": 504
},
{
"clip_ratio": 0.0,
"completion_length": 502.59375,
"epoch": 1.0520833333333333,
"grad_norm": 0.009273013100028038,
"kl": 0.2431640625,
"learning_rate": 6.493055555555556e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 505
},
{
"clip_ratio": 0.0,
"completion_length": 548.78125,
"epoch": 1.0541666666666667,
"grad_norm": 1.5016146898269653,
"kl": 0.240234375,
"learning_rate": 6.486111111111111e-07,
"loss": 0.0002,
"reward": 0.8125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.75,
"step": 506
},
{
"clip_ratio": 0.0,
"completion_length": 529.96875,
"epoch": 1.05625,
"grad_norm": 1.255817174911499,
"kl": 0.255859375,
"learning_rate": 6.479166666666667e-07,
"loss": 0.0003,
"reward": 1.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.75,
"step": 507
},
{
"clip_ratio": 0.0,
"completion_length": 500.96875,
"epoch": 1.0583333333333333,
"grad_norm": 2.7088470458984375,
"kl": 0.26611328125,
"learning_rate": 6.472222222222222e-07,
"loss": 0.0003,
"reward": 0.875,
"reward_std": 0.2314550280570984,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.5,
"step": 508
},
{
"clip_ratio": 0.0,
"completion_length": 516.90625,
"epoch": 1.0604166666666666,
"grad_norm": 1.8195452690124512,
"kl": 0.25927734375,
"learning_rate": 6.465277777777778e-07,
"loss": 0.0003,
"reward": 1.34375,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 1.0,
"step": 509
},
{
"clip_ratio": 0.0,
"completion_length": 501.3125,
"epoch": 1.0625,
"grad_norm": 1.773526668548584,
"kl": 0.245361328125,
"learning_rate": 6.458333333333333e-07,
"loss": 0.0002,
"reward": 1.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 510
},
{
"clip_ratio": 0.0,
"completion_length": 479.90625,
"epoch": 1.0645833333333334,
"grad_norm": 1.7840821743011475,
"kl": 0.2734375,
"learning_rate": 6.451388888888889e-07,
"loss": 0.0003,
"reward": 1.15625,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 1.0,
"step": 511
},
{
"clip_ratio": 0.0,
"completion_length": 455.40625,
"epoch": 1.0666666666666667,
"grad_norm": 2.6982321739196777,
"kl": 0.2841796875,
"learning_rate": 6.444444444444444e-07,
"loss": 0.0003,
"reward": 0.875,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.5,
"step": 512
},
{
"clip_ratio": 0.0,
"completion_length": 487.375,
"epoch": 1.06875,
"grad_norm": 1.1204781532287598,
"kl": 0.2626953125,
"learning_rate": 6.4375e-07,
"loss": 0.0003,
"reward": 0.875,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.5,
"step": 513
},
{
"clip_ratio": 0.0,
"completion_length": 461.46875,
"epoch": 1.0708333333333333,
"grad_norm": 1.2831130027770996,
"kl": 0.263671875,
"learning_rate": 6.430555555555555e-07,
"loss": 0.0003,
"reward": 0.6875,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.5,
"step": 514
},
{
"clip_ratio": 0.0,
"completion_length": 434.28125,
"epoch": 1.0729166666666667,
"grad_norm": 2.457568407058716,
"kl": 0.26123046875,
"learning_rate": 6.423611111111112e-07,
"loss": 0.0003,
"reward": 1.875,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 1.0,
"step": 515
},
{
"clip_ratio": 0.0,
"completion_length": 484.15625,
"epoch": 1.075,
"grad_norm": 3.24466609954834,
"kl": 0.25537109375,
"learning_rate": 6.416666666666667e-07,
"loss": 0.0003,
"reward": 1.53125,
"reward_std": 0.3061639815568924,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 1.0,
"step": 516
},
{
"clip_ratio": 0.0,
"completion_length": 398.5625,
"epoch": 1.0770833333333334,
"grad_norm": 2.705453634262085,
"kl": 0.29150390625,
"learning_rate": 6.409722222222223e-07,
"loss": 0.0003,
"reward": 1.09375,
"reward_std": 0.3061639815568924,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.75,
"step": 517
},
{
"clip_ratio": 0.0,
"completion_length": 446.875,
"epoch": 1.0791666666666666,
"grad_norm": 1.0492925643920898,
"kl": 0.26123046875,
"learning_rate": 6.402777777777777e-07,
"loss": 0.0003,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 1.0,
"step": 518
},
{
"clip_ratio": 0.0,
"completion_length": 401.84375,
"epoch": 1.08125,
"grad_norm": 2.4395976066589355,
"kl": 0.2763671875,
"learning_rate": 6.395833333333333e-07,
"loss": 0.0003,
"reward": 1.6875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 1.0,
"step": 519
},
{
"clip_ratio": 0.0,
"completion_length": 412.21875,
"epoch": 1.0833333333333333,
"grad_norm": 0.008863838389515877,
"kl": 0.27783203125,
"learning_rate": 6.388888888888888e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 520
},
{
"clip_ratio": 0.0,
"completion_length": 405.125,
"epoch": 1.0854166666666667,
"grad_norm": 1.1511051654815674,
"kl": 0.26416015625,
"learning_rate": 6.381944444444444e-07,
"loss": 0.0003,
"reward": 0.40625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.25,
"step": 521
},
{
"clip_ratio": 0.0,
"completion_length": 385.125,
"epoch": 1.0875,
"grad_norm": 0.013866711407899857,
"kl": 0.287841796875,
"learning_rate": 6.374999999999999e-07,
"loss": 0.0003,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 522
},
{
"clip_ratio": 0.0,
"completion_length": 385.40625,
"epoch": 1.0895833333333333,
"grad_norm": 1.231552004814148,
"kl": 0.28515625,
"learning_rate": 6.368055555555555e-07,
"loss": 0.0003,
"reward": 1.40625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.75,
"step": 523
},
{
"clip_ratio": 0.0,
"completion_length": 380.28125,
"epoch": 1.0916666666666666,
"grad_norm": 5.2175092697143555,
"kl": 0.28759765625,
"learning_rate": 6.36111111111111e-07,
"loss": 0.0003,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 524
},
{
"clip_ratio": 0.0,
"completion_length": 372.625,
"epoch": 1.09375,
"grad_norm": 16.90829086303711,
"kl": 0.31396484375,
"learning_rate": 6.354166666666666e-07,
"loss": 0.0003,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 525
},
{
"clip_ratio": 0.0,
"completion_length": 360.71875,
"epoch": 1.0958333333333334,
"grad_norm": 1.8970509767532349,
"kl": 0.29345703125,
"learning_rate": 6.347222222222221e-07,
"loss": 0.0003,
"reward": 1.34375,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 526
},
{
"clip_ratio": 0.0,
"completion_length": 355.28125,
"epoch": 1.0979166666666667,
"grad_norm": 1.165528416633606,
"kl": 0.3154296875,
"learning_rate": 6.340277777777777e-07,
"loss": 0.0003,
"reward": 1.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 1.0,
"step": 527
},
{
"clip_ratio": 0.0,
"completion_length": 349.09375,
"epoch": 1.1,
"grad_norm": 1.5739704370498657,
"kl": 0.326171875,
"learning_rate": 6.333333333333332e-07,
"loss": 0.0003,
"reward": 1.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 1.0,
"step": 528
},
{
"clip_ratio": 0.0,
"completion_length": 341.78125,
"epoch": 1.1020833333333333,
"grad_norm": 2.9738852977752686,
"kl": 0.29833984375,
"learning_rate": 6.326388888888888e-07,
"loss": 0.0003,
"reward": 1.90625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.90625,
"rewards/format_reward": 1.0,
"step": 529
},
{
"clip_ratio": 0.0,
"completion_length": 350.75,
"epoch": 1.1041666666666667,
"grad_norm": 0.0143277607858181,
"kl": 0.2939453125,
"learning_rate": 6.319444444444444e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 530
},
{
"clip_ratio": 0.0,
"completion_length": 351.5,
"epoch": 1.10625,
"grad_norm": 2.137756824493408,
"kl": 0.31005859375,
"learning_rate": 6.3125e-07,
"loss": 0.0003,
"reward": 1.6875,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 1.0,
"step": 531
},
{
"clip_ratio": 0.0,
"completion_length": 325.71875,
"epoch": 1.1083333333333334,
"grad_norm": 1.2957160472869873,
"kl": 0.32470703125,
"learning_rate": 6.305555555555555e-07,
"loss": 0.0003,
"reward": 1.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 532
},
{
"clip_ratio": 0.0,
"completion_length": 326.4375,
"epoch": 1.1104166666666666,
"grad_norm": 2.687209367752075,
"kl": 0.33447265625,
"learning_rate": 6.298611111111111e-07,
"loss": 0.0003,
"reward": 1.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 533
},
{
"clip_ratio": 0.0,
"completion_length": 310.15625,
"epoch": 1.1125,
"grad_norm": 2.23334002494812,
"kl": 0.3720703125,
"learning_rate": 6.291666666666666e-07,
"loss": 0.0004,
"reward": 1.21875,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 534
},
{
"clip_ratio": 0.0,
"completion_length": 311.375,
"epoch": 1.1145833333333333,
"grad_norm": 2.0771384239196777,
"kl": 0.31884765625,
"learning_rate": 6.284722222222222e-07,
"loss": 0.0003,
"reward": 1.4375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 535
},
{
"clip_ratio": 0.0,
"completion_length": 325.09375,
"epoch": 1.1166666666666667,
"grad_norm": 3.610607624053955,
"kl": 0.2958984375,
"learning_rate": 6.277777777777777e-07,
"loss": 0.0003,
"reward": 1.6875,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 1.0,
"step": 536
},
{
"clip_ratio": 0.0,
"completion_length": 319.0,
"epoch": 1.11875,
"grad_norm": 2.343539237976074,
"kl": 0.3251953125,
"learning_rate": 6.270833333333333e-07,
"loss": 0.0003,
"reward": 1.71875,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 537
},
{
"clip_ratio": 0.0,
"completion_length": 312.25,
"epoch": 1.1208333333333333,
"grad_norm": 0.011663875542581081,
"kl": 0.32763671875,
"learning_rate": 6.263888888888888e-07,
"loss": 0.0003,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 538
},
{
"clip_ratio": 0.0,
"completion_length": 319.78125,
"epoch": 1.1229166666666666,
"grad_norm": 1.8692626953125,
"kl": 0.31982421875,
"learning_rate": 6.256944444444444e-07,
"loss": 0.0003,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 539
},
{
"clip_ratio": 0.0,
"completion_length": 316.53125,
"epoch": 1.125,
"grad_norm": 1.2807447910308838,
"kl": 0.328125,
"learning_rate": 6.249999999999999e-07,
"loss": 0.0003,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 540
},
{
"clip_ratio": 0.0,
"completion_length": 316.71875,
"epoch": 1.1270833333333332,
"grad_norm": 1.4935797452926636,
"kl": 0.33740234375,
"learning_rate": 6.243055555555555e-07,
"loss": 0.0003,
"reward": 1.375,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.75,
"step": 541
},
{
"clip_ratio": 0.0,
"completion_length": 303.125,
"epoch": 1.1291666666666667,
"grad_norm": 1.423712968826294,
"kl": 0.34033203125,
"learning_rate": 6.23611111111111e-07,
"loss": 0.0003,
"reward": 1.90625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.90625,
"rewards/format_reward": 1.0,
"step": 542
},
{
"clip_ratio": 0.0,
"completion_length": 302.28125,
"epoch": 1.13125,
"grad_norm": 1.2090955972671509,
"kl": 0.35791015625,
"learning_rate": 6.229166666666666e-07,
"loss": 0.0004,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.5,
"step": 543
},
{
"clip_ratio": 0.0,
"completion_length": 328.78125,
"epoch": 1.1333333333333333,
"grad_norm": 0.017780767753720284,
"kl": 0.337890625,
"learning_rate": 6.222222222222223e-07,
"loss": 0.0003,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 544
},
{
"clip_ratio": 0.0,
"completion_length": 330.375,
"epoch": 1.1354166666666667,
"grad_norm": 0.01260452438145876,
"kl": 0.33544921875,
"learning_rate": 6.215277777777778e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.75,
"step": 545
},
{
"clip_ratio": 0.0,
"completion_length": 294.5625,
"epoch": 1.1375,
"grad_norm": 0.07112853229045868,
"kl": 0.40478515625,
"learning_rate": 6.208333333333334e-07,
"loss": 0.0004,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 1.0,
"step": 546
},
{
"clip_ratio": 0.0,
"completion_length": 292.90625,
"epoch": 1.1395833333333334,
"grad_norm": 0.010367254726588726,
"kl": 0.341796875,
"learning_rate": 6.201388888888889e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 547
},
{
"clip_ratio": 0.0,
"completion_length": 312.90625,
"epoch": 1.1416666666666666,
"grad_norm": 0.013733302243053913,
"kl": 0.32421875,
"learning_rate": 6.194444444444445e-07,
"loss": 0.0003,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 548
},
{
"clip_ratio": 0.0,
"completion_length": 311.03125,
"epoch": 1.14375,
"grad_norm": 2.095188856124878,
"kl": 0.33154296875,
"learning_rate": 6.1875e-07,
"loss": 0.0003,
"reward": 1.65625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 1.0,
"step": 549
},
{
"clip_ratio": 0.0,
"completion_length": 318.1875,
"epoch": 1.1458333333333333,
"grad_norm": 0.011102787218987942,
"kl": 0.31787109375,
"learning_rate": 6.180555555555556e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 550
},
{
"clip_ratio": 0.0,
"completion_length": 309.59375,
"epoch": 1.1479166666666667,
"grad_norm": 0.011943946592509747,
"kl": 0.3408203125,
"learning_rate": 6.173611111111111e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 551
},
{
"clip_ratio": 0.0,
"completion_length": 315.75,
"epoch": 1.15,
"grad_norm": 1.2020156383514404,
"kl": 0.32568359375,
"learning_rate": 6.166666666666667e-07,
"loss": 0.0003,
"reward": 0.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.25,
"step": 552
},
{
"clip_ratio": 0.0,
"completion_length": 326.40625,
"epoch": 1.1520833333333333,
"grad_norm": 2.2801804542541504,
"kl": 0.34716796875,
"learning_rate": 6.159722222222222e-07,
"loss": 0.0003,
"reward": 1.3125,
"reward_std": 0.249358132481575,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.75,
"step": 553
},
{
"clip_ratio": 0.0,
"completion_length": 312.3125,
"epoch": 1.1541666666666668,
"grad_norm": 0.00982726365327835,
"kl": 0.32421875,
"learning_rate": 6.152777777777778e-07,
"loss": 0.0003,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 554
},
{
"clip_ratio": 0.0,
"completion_length": 317.625,
"epoch": 1.15625,
"grad_norm": 2.7104454040527344,
"kl": 0.31591796875,
"learning_rate": 6.145833333333333e-07,
"loss": 0.0003,
"reward": 1.4375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 555
},
{
"clip_ratio": 0.0,
"completion_length": 312.65625,
"epoch": 1.1583333333333332,
"grad_norm": 3.31844162940979,
"kl": 0.33349609375,
"learning_rate": 6.138888888888889e-07,
"loss": 0.0003,
"reward": 1.34375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 556
},
{
"clip_ratio": 0.0,
"completion_length": 308.46875,
"epoch": 1.1604166666666667,
"grad_norm": 0.021200576797127724,
"kl": 0.373046875,
"learning_rate": 6.131944444444444e-07,
"loss": 0.0004,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 557
},
{
"clip_ratio": 0.0,
"completion_length": 315.53125,
"epoch": 1.1625,
"grad_norm": 1.4671574831008911,
"kl": 0.333984375,
"learning_rate": 6.125000000000001e-07,
"loss": 0.0003,
"reward": 1.90625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.90625,
"rewards/format_reward": 1.0,
"step": 558
},
{
"clip_ratio": 0.0,
"completion_length": 301.25,
"epoch": 1.1645833333333333,
"grad_norm": 0.013326307758688927,
"kl": 0.35498046875,
"learning_rate": 6.118055555555556e-07,
"loss": 0.0004,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 559
},
{
"clip_ratio": 0.0,
"completion_length": 307.40625,
"epoch": 1.1666666666666667,
"grad_norm": 1.3512459993362427,
"kl": 0.35302734375,
"learning_rate": 6.111111111111112e-07,
"loss": 0.0004,
"reward": 1.1875,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 560
},
{
"clip_ratio": 0.0,
"completion_length": 313.90625,
"epoch": 1.16875,
"grad_norm": 1.6833953857421875,
"kl": 0.33740234375,
"learning_rate": 6.104166666666667e-07,
"loss": 0.0003,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 561
},
{
"clip_ratio": 0.0,
"completion_length": 320.6875,
"epoch": 1.1708333333333334,
"grad_norm": 0.08547837287187576,
"kl": 0.33349609375,
"learning_rate": 6.097222222222223e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 562
},
{
"clip_ratio": 0.0,
"completion_length": 310.53125,
"epoch": 1.1729166666666666,
"grad_norm": 0.01078501995652914,
"kl": 0.32470703125,
"learning_rate": 6.090277777777777e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 563
},
{
"clip_ratio": 0.0,
"completion_length": 296.25,
"epoch": 1.175,
"grad_norm": 1.2430768013000488,
"kl": 0.35400390625,
"learning_rate": 6.083333333333333e-07,
"loss": 0.0004,
"reward": 1.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 1.0,
"step": 564
},
{
"clip_ratio": 0.0,
"completion_length": 321.9375,
"epoch": 1.1770833333333333,
"grad_norm": 1.3310391902923584,
"kl": 0.30419921875,
"learning_rate": 6.076388888888888e-07,
"loss": 0.0003,
"reward": 1.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 1.0,
"step": 565
},
{
"clip_ratio": 0.0,
"completion_length": 299.1875,
"epoch": 1.1791666666666667,
"grad_norm": 0.012161072343587875,
"kl": 0.32958984375,
"learning_rate": 6.069444444444444e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 566
},
{
"clip_ratio": 0.0,
"completion_length": 324.65625,
"epoch": 1.18125,
"grad_norm": 0.011912676505744457,
"kl": 0.32958984375,
"learning_rate": 6.062499999999999e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 567
},
{
"clip_ratio": 0.0,
"completion_length": 301.5625,
"epoch": 1.1833333333333333,
"grad_norm": 0.01320314034819603,
"kl": 0.3642578125,
"learning_rate": 6.055555555555555e-07,
"loss": 0.0004,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 568
},
{
"clip_ratio": 0.0,
"completion_length": 303.46875,
"epoch": 1.1854166666666668,
"grad_norm": 3.1924169063568115,
"kl": 0.33984375,
"learning_rate": 6.04861111111111e-07,
"loss": 0.0003,
"reward": 1.65625,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.9375,
"step": 569
},
{
"clip_ratio": 0.0,
"completion_length": 325.4375,
"epoch": 1.1875,
"grad_norm": 0.012552078813314438,
"kl": 0.32080078125,
"learning_rate": 6.041666666666666e-07,
"loss": 0.0003,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 570
},
{
"clip_ratio": 0.0,
"completion_length": 304.125,
"epoch": 1.1895833333333332,
"grad_norm": 3.063969373703003,
"kl": 0.3232421875,
"learning_rate": 6.034722222222221e-07,
"loss": 0.0003,
"reward": 1.625,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 1.0,
"step": 571
},
{
"clip_ratio": 0.0,
"completion_length": 314.15625,
"epoch": 1.1916666666666667,
"grad_norm": 1.8180301189422607,
"kl": 0.3349609375,
"learning_rate": 6.027777777777778e-07,
"loss": 0.0003,
"reward": 1.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 1.0,
"step": 572
},
{
"clip_ratio": 0.0,
"completion_length": 303.96875,
"epoch": 1.19375,
"grad_norm": 1.8491965532302856,
"kl": 0.33935546875,
"learning_rate": 6.020833333333333e-07,
"loss": 0.0003,
"reward": 0.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.25,
"step": 573
},
{
"clip_ratio": 0.0,
"completion_length": 308.6875,
"epoch": 1.1958333333333333,
"grad_norm": 1.7229721546173096,
"kl": 0.33203125,
"learning_rate": 6.013888888888889e-07,
"loss": 0.0003,
"reward": 1.03125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.75,
"step": 574
},
{
"clip_ratio": 0.0,
"completion_length": 302.625,
"epoch": 1.1979166666666667,
"grad_norm": 3.754805564880371,
"kl": 0.32080078125,
"learning_rate": 6.006944444444444e-07,
"loss": 0.0003,
"reward": 0.96875,
"reward_std": 0.2630179077386856,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.75,
"step": 575
},
{
"clip_ratio": 0.0,
"completion_length": 294.40625,
"epoch": 1.2,
"grad_norm": 1.3297032117843628,
"kl": 0.33056640625,
"learning_rate": 6e-07,
"loss": 0.0003,
"reward": 0.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.25,
"step": 576
},
{
"clip_ratio": 0.0,
"completion_length": 290.125,
"epoch": 1.2020833333333334,
"grad_norm": 1.3171557188034058,
"kl": 0.32080078125,
"learning_rate": 5.993055555555555e-07,
"loss": 0.0003,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 577
},
{
"clip_ratio": 0.0,
"completion_length": 292.28125,
"epoch": 1.2041666666666666,
"grad_norm": 0.010273904539644718,
"kl": 0.3369140625,
"learning_rate": 5.986111111111111e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 578
},
{
"clip_ratio": 0.0,
"completion_length": 277.84375,
"epoch": 1.20625,
"grad_norm": 3.0093436241149902,
"kl": 0.3671875,
"learning_rate": 5.979166666666666e-07,
"loss": 0.0004,
"reward": 0.90625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.5,
"step": 579
},
{
"clip_ratio": 0.0,
"completion_length": 282.15625,
"epoch": 1.2083333333333333,
"grad_norm": 1.7877336740493774,
"kl": 0.36328125,
"learning_rate": 5.972222222222222e-07,
"loss": 0.0004,
"reward": 1.9375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 580
},
{
"clip_ratio": 0.0,
"completion_length": 273.3125,
"epoch": 1.2104166666666667,
"grad_norm": 1.9994487762451172,
"kl": 0.34375,
"learning_rate": 5.965277777777777e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 581
},
{
"clip_ratio": 0.0,
"completion_length": 249.65625,
"epoch": 1.2125,
"grad_norm": 1.301345705986023,
"kl": 0.37646484375,
"learning_rate": 5.958333333333333e-07,
"loss": 0.0004,
"reward": 1.78125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 1.0,
"step": 582
},
{
"clip_ratio": 0.0,
"completion_length": 282.96875,
"epoch": 1.2145833333333333,
"grad_norm": 2.8755271434783936,
"kl": 0.32958984375,
"learning_rate": 5.951388888888888e-07,
"loss": 0.0003,
"reward": 1.75,
"reward_std": 0.3335031494498253,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 583
},
{
"clip_ratio": 0.0,
"completion_length": 268.9375,
"epoch": 1.2166666666666668,
"grad_norm": 2.305840253829956,
"kl": 0.37548828125,
"learning_rate": 5.944444444444444e-07,
"loss": 0.0004,
"reward": 1.25,
"reward_std": 0.2925042062997818,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 584
},
{
"clip_ratio": 0.0,
"completion_length": 272.46875,
"epoch": 1.21875,
"grad_norm": 0.011355455964803696,
"kl": 0.33740234375,
"learning_rate": 5.937499999999999e-07,
"loss": 0.0003,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.25,
"step": 585
},
{
"clip_ratio": 0.0,
"completion_length": 279.375,
"epoch": 1.2208333333333332,
"grad_norm": 1.3942476511001587,
"kl": 0.3388671875,
"learning_rate": 5.930555555555556e-07,
"loss": 0.0003,
"reward": 1.1875,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 586
},
{
"clip_ratio": 0.0,
"completion_length": 284.28125,
"epoch": 1.2229166666666667,
"grad_norm": 1.5005948543548584,
"kl": 0.349609375,
"learning_rate": 5.923611111111111e-07,
"loss": 0.0003,
"reward": 1.09375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.75,
"step": 587
},
{
"clip_ratio": 0.0,
"completion_length": 270.5,
"epoch": 1.225,
"grad_norm": 0.01924080029129982,
"kl": 0.357421875,
"learning_rate": 5.916666666666667e-07,
"loss": 0.0004,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 588
},
{
"clip_ratio": 0.0,
"completion_length": 267.71875,
"epoch": 1.2270833333333333,
"grad_norm": 1.5551037788391113,
"kl": 0.951171875,
"learning_rate": 5.909722222222222e-07,
"loss": 0.001,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 589
},
{
"clip_ratio": 0.0,
"completion_length": 274.03125,
"epoch": 1.2291666666666667,
"grad_norm": 1.4413371086120605,
"kl": 0.36376953125,
"learning_rate": 5.902777777777778e-07,
"loss": 0.0004,
"reward": 1.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 590
},
{
"clip_ratio": 0.0,
"completion_length": 271.34375,
"epoch": 1.23125,
"grad_norm": 1.7674862146377563,
"kl": 0.373046875,
"learning_rate": 5.895833333333333e-07,
"loss": 0.0004,
"reward": 1.125,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.75,
"step": 591
},
{
"clip_ratio": 0.0,
"completion_length": 273.125,
"epoch": 1.2333333333333334,
"grad_norm": 8.082505226135254,
"kl": 0.3798828125,
"learning_rate": 5.888888888888889e-07,
"loss": 0.0004,
"reward": 1.09375,
"reward_std": 0.18600594997406006,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.96875,
"step": 592
},
{
"clip_ratio": 0.0,
"completion_length": 259.65625,
"epoch": 1.2354166666666666,
"grad_norm": 1.4379527568817139,
"kl": 0.39208984375,
"learning_rate": 5.881944444444444e-07,
"loss": 0.0004,
"reward": 1.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 1.0,
"step": 593
},
{
"clip_ratio": 0.0,
"completion_length": 258.15625,
"epoch": 1.2375,
"grad_norm": 0.6245195269584656,
"kl": 0.38427734375,
"learning_rate": 5.875e-07,
"loss": 0.0004,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 594
},
{
"clip_ratio": 0.0,
"completion_length": 260.8125,
"epoch": 1.2395833333333333,
"grad_norm": 0.010753520764410496,
"kl": 0.36669921875,
"learning_rate": 5.868055555555555e-07,
"loss": 0.0004,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 595
},
{
"clip_ratio": 0.0,
"completion_length": 256.78125,
"epoch": 1.2416666666666667,
"grad_norm": 1.4395649433135986,
"kl": 0.3857421875,
"learning_rate": 5.861111111111111e-07,
"loss": 0.0004,
"reward": 0.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.5,
"step": 596
},
{
"clip_ratio": 0.0,
"completion_length": 262.0625,
"epoch": 1.24375,
"grad_norm": 1.6355886459350586,
"kl": 0.365234375,
"learning_rate": 5.854166666666666e-07,
"loss": 0.0004,
"reward": 1.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 597
},
{
"clip_ratio": 0.0,
"completion_length": 270.3125,
"epoch": 1.2458333333333333,
"grad_norm": 0.01275028195232153,
"kl": 0.3564453125,
"learning_rate": 5.847222222222222e-07,
"loss": 0.0004,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 598
},
{
"clip_ratio": 0.0,
"completion_length": 254.625,
"epoch": 1.2479166666666668,
"grad_norm": 1.4088735580444336,
"kl": 0.37939453125,
"learning_rate": 5.840277777777777e-07,
"loss": 0.0004,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.5,
"step": 599
},
{
"clip_ratio": 0.0,
"completion_length": 254.25,
"epoch": 1.25,
"grad_norm": 0.02105838432908058,
"kl": 0.3818359375,
"learning_rate": 5.833333333333334e-07,
"loss": 0.0004,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 600
},
{
"clip_ratio": 0.0,
"completion_length": 258.0,
"epoch": 1.2520833333333332,
"grad_norm": 3.5220136642456055,
"kl": 0.37451171875,
"learning_rate": 5.826388888888889e-07,
"loss": 0.0004,
"reward": 1.34375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 601
},
{
"clip_ratio": 0.0,
"completion_length": 252.4375,
"epoch": 1.2541666666666667,
"grad_norm": 0.013569949194788933,
"kl": 0.375,
"learning_rate": 5.819444444444445e-07,
"loss": 0.0004,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.25,
"step": 602
},
{
"clip_ratio": 0.0,
"completion_length": 252.5625,
"epoch": 1.25625,
"grad_norm": 1.4851642847061157,
"kl": 0.3671875,
"learning_rate": 5.8125e-07,
"loss": 0.0004,
"reward": 1.375,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.75,
"step": 603
},
{
"clip_ratio": 0.0,
"completion_length": 262.8125,
"epoch": 1.2583333333333333,
"grad_norm": 1.887512445449829,
"kl": 0.33935546875,
"learning_rate": 5.805555555555556e-07,
"loss": 0.0003,
"reward": 1.875,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 1.0,
"step": 604
},
{
"clip_ratio": 0.0,
"completion_length": 252.3125,
"epoch": 1.2604166666666667,
"grad_norm": 1.4296413660049438,
"kl": 0.3642578125,
"learning_rate": 5.798611111111111e-07,
"loss": 0.0004,
"reward": 1.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.75,
"step": 605
},
{
"clip_ratio": 0.0,
"completion_length": 239.375,
"epoch": 1.2625,
"grad_norm": 0.010707022622227669,
"kl": 0.35888671875,
"learning_rate": 5.791666666666667e-07,
"loss": 0.0004,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 606
},
{
"clip_ratio": 0.0,
"completion_length": 248.625,
"epoch": 1.2645833333333334,
"grad_norm": 8.736783981323242,
"kl": 0.353515625,
"learning_rate": 5.784722222222222e-07,
"loss": 0.0004,
"reward": 0.625,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.5,
"step": 607
},
{
"clip_ratio": 0.0,
"completion_length": 252.25,
"epoch": 1.2666666666666666,
"grad_norm": 0.012715312652289867,
"kl": 0.36669921875,
"learning_rate": 5.777777777777777e-07,
"loss": 0.0004,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 608
},
{
"clip_ratio": 0.0,
"completion_length": 249.34375,
"epoch": 1.26875,
"grad_norm": 1.8922051191329956,
"kl": 0.39208984375,
"learning_rate": 5.770833333333332e-07,
"loss": 0.0004,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 609
},
{
"clip_ratio": 0.0,
"completion_length": 258.375,
"epoch": 1.2708333333333333,
"grad_norm": 0.028345325961709023,
"kl": 0.36767578125,
"learning_rate": 5.763888888888888e-07,
"loss": 0.0004,
"reward": 1.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 610
},
{
"clip_ratio": 0.0,
"completion_length": 243.96875,
"epoch": 1.2729166666666667,
"grad_norm": 0.015319216065108776,
"kl": 0.37646484375,
"learning_rate": 5.756944444444443e-07,
"loss": 0.0004,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 611
},
{
"clip_ratio": 0.0,
"completion_length": 247.65625,
"epoch": 1.275,
"grad_norm": 2.6803572177886963,
"kl": 0.3662109375,
"learning_rate": 5.749999999999999e-07,
"loss": 0.0004,
"reward": 1.21875,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 612
},
{
"clip_ratio": 0.0,
"completion_length": 265.9375,
"epoch": 1.2770833333333333,
"grad_norm": 6.572545051574707,
"kl": 0.361328125,
"learning_rate": 5.743055555555554e-07,
"loss": 0.0004,
"reward": 1.78125,
"reward_std": 0.24511480331420898,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 1.0,
"step": 613
},
{
"clip_ratio": 0.0,
"completion_length": 246.75,
"epoch": 1.2791666666666668,
"grad_norm": 2.399791717529297,
"kl": 0.3876953125,
"learning_rate": 5.73611111111111e-07,
"loss": 0.0004,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 1.0,
"step": 614
},
{
"clip_ratio": 0.0,
"completion_length": 268.625,
"epoch": 1.28125,
"grad_norm": 1.9824419021606445,
"kl": 0.388671875,
"learning_rate": 5.729166666666667e-07,
"loss": 0.0004,
"reward": 1.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 615
},
{
"clip_ratio": 0.0,
"completion_length": 242.3125,
"epoch": 1.2833333333333332,
"grad_norm": 1.5071178674697876,
"kl": 0.3486328125,
"learning_rate": 5.722222222222222e-07,
"loss": 0.0003,
"reward": 1.5625,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 1.0,
"step": 616
},
{
"clip_ratio": 0.0,
"completion_length": 258.65625,
"epoch": 1.2854166666666667,
"grad_norm": 0.010986674576997757,
"kl": 0.3466796875,
"learning_rate": 5.715277777777778e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 617
},
{
"clip_ratio": 0.0,
"completion_length": 254.0625,
"epoch": 1.2875,
"grad_norm": 1.5354124307632446,
"kl": 0.3525390625,
"learning_rate": 5.708333333333333e-07,
"loss": 0.0004,
"reward": 1.78125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 1.0,
"step": 618
},
{
"clip_ratio": 0.0,
"completion_length": 244.96875,
"epoch": 1.2895833333333333,
"grad_norm": 2.0228993892669678,
"kl": 0.4052734375,
"learning_rate": 5.701388888888889e-07,
"loss": 0.0004,
"reward": 1.25,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 619
},
{
"clip_ratio": 0.0,
"completion_length": 251.5625,
"epoch": 1.2916666666666667,
"grad_norm": 2.1687755584716797,
"kl": 0.3505859375,
"learning_rate": 5.694444444444444e-07,
"loss": 0.0004,
"reward": 1.21875,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 620
},
{
"clip_ratio": 0.0,
"completion_length": 268.8125,
"epoch": 1.29375,
"grad_norm": 2.1152918338775635,
"kl": 0.3662109375,
"learning_rate": 5.6875e-07,
"loss": 0.0004,
"reward": 1.78125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 1.0,
"step": 621
},
{
"clip_ratio": 0.0,
"completion_length": 273.78125,
"epoch": 1.2958333333333334,
"grad_norm": 0.011094697751104832,
"kl": 0.35546875,
"learning_rate": 5.680555555555555e-07,
"loss": 0.0004,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 622
},
{
"clip_ratio": 0.0,
"completion_length": 249.28125,
"epoch": 1.2979166666666666,
"grad_norm": 0.02124650590121746,
"kl": 0.36181640625,
"learning_rate": 5.673611111111111e-07,
"loss": 0.0004,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 623
},
{
"clip_ratio": 0.0,
"completion_length": 267.9375,
"epoch": 1.3,
"grad_norm": 0.012937084771692753,
"kl": 0.35693359375,
"learning_rate": 5.666666666666666e-07,
"loss": 0.0004,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 624
},
{
"clip_ratio": 0.0,
"completion_length": 300.34375,
"epoch": 1.3020833333333333,
"grad_norm": 0.011210695840418339,
"kl": 0.32861328125,
"learning_rate": 5.659722222222222e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 625
},
{
"clip_ratio": 0.0,
"completion_length": 290.375,
"epoch": 1.3041666666666667,
"grad_norm": 0.035728760063648224,
"kl": 0.34716796875,
"learning_rate": 5.652777777777777e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 626
},
{
"clip_ratio": 0.0,
"completion_length": 265.625,
"epoch": 1.30625,
"grad_norm": 1.3605639934539795,
"kl": 0.353515625,
"learning_rate": 5.645833333333333e-07,
"loss": 0.0004,
"reward": 1.09375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.75,
"step": 627
},
{
"clip_ratio": 0.0,
"completion_length": 302.53125,
"epoch": 1.3083333333333333,
"grad_norm": 0.012040174566209316,
"kl": 0.3154296875,
"learning_rate": 5.638888888888888e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 628
},
{
"clip_ratio": 0.0,
"completion_length": 280.96875,
"epoch": 1.3104166666666668,
"grad_norm": 0.009581932798027992,
"kl": 0.3369140625,
"learning_rate": 5.631944444444445e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 629
},
{
"clip_ratio": 0.0,
"completion_length": 298.03125,
"epoch": 1.3125,
"grad_norm": 3.3300986289978027,
"kl": 0.3271484375,
"learning_rate": 5.625e-07,
"loss": 0.0003,
"reward": 1.15625,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.75,
"step": 630
},
{
"clip_ratio": 0.0,
"completion_length": 287.59375,
"epoch": 1.3145833333333332,
"grad_norm": 1.4340507984161377,
"kl": 0.3232421875,
"learning_rate": 5.618055555555556e-07,
"loss": 0.0003,
"reward": 1.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 1.0,
"step": 631
},
{
"clip_ratio": 0.0,
"completion_length": 285.09375,
"epoch": 1.3166666666666667,
"grad_norm": 0.015347130596637726,
"kl": 0.36181640625,
"learning_rate": 5.611111111111111e-07,
"loss": 0.0004,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 632
},
{
"clip_ratio": 0.0,
"completion_length": 292.75,
"epoch": 1.31875,
"grad_norm": 0.16325514018535614,
"kl": 0.328125,
"learning_rate": 5.604166666666667e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 633
},
{
"clip_ratio": 0.0,
"completion_length": 286.9375,
"epoch": 1.3208333333333333,
"grad_norm": 0.013178522698581219,
"kl": 0.34130859375,
"learning_rate": 5.597222222222222e-07,
"loss": 0.0003,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 634
},
{
"clip_ratio": 0.0,
"completion_length": 299.375,
"epoch": 1.3229166666666667,
"grad_norm": 1.6165486574172974,
"kl": 0.34130859375,
"learning_rate": 5.590277777777778e-07,
"loss": 0.0003,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 635
},
{
"clip_ratio": 0.0,
"completion_length": 299.0625,
"epoch": 1.325,
"grad_norm": 0.013300875201821327,
"kl": 0.34521484375,
"learning_rate": 5.583333333333333e-07,
"loss": 0.0003,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 636
},
{
"clip_ratio": 0.0,
"completion_length": 283.28125,
"epoch": 1.3270833333333334,
"grad_norm": 1.5510921478271484,
"kl": 0.318359375,
"learning_rate": 5.576388888888889e-07,
"loss": 0.0003,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 637
},
{
"clip_ratio": 0.0,
"completion_length": 297.03125,
"epoch": 1.3291666666666666,
"grad_norm": 1.1744096279144287,
"kl": 0.3017578125,
"learning_rate": 5.569444444444444e-07,
"loss": 0.0003,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 638
},
{
"clip_ratio": 0.0,
"completion_length": 300.8125,
"epoch": 1.33125,
"grad_norm": 0.009618040174245834,
"kl": 0.3095703125,
"learning_rate": 5.5625e-07,
"loss": 0.0003,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 639
},
{
"clip_ratio": 0.0,
"completion_length": 307.09375,
"epoch": 1.3333333333333333,
"grad_norm": 0.017916277050971985,
"kl": 0.3583984375,
"learning_rate": 5.555555555555555e-07,
"loss": 0.0004,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 640
},
{
"clip_ratio": 0.0,
"completion_length": 297.53125,
"epoch": 1.3354166666666667,
"grad_norm": 0.011587453074753284,
"kl": 0.32958984375,
"learning_rate": 5.548611111111111e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 641
},
{
"clip_ratio": 0.0,
"completion_length": 304.40625,
"epoch": 1.3375,
"grad_norm": 1.2505037784576416,
"kl": 0.3388671875,
"learning_rate": 5.541666666666666e-07,
"loss": 0.0003,
"reward": 1.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.75,
"step": 642
},
{
"clip_ratio": 0.0,
"completion_length": 306.21875,
"epoch": 1.3395833333333333,
"grad_norm": 0.009628149680793285,
"kl": 0.3154296875,
"learning_rate": 5.534722222222223e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 643
},
{
"clip_ratio": 0.0,
"completion_length": 298.90625,
"epoch": 1.3416666666666668,
"grad_norm": 0.012987801805138588,
"kl": 0.32666015625,
"learning_rate": 5.527777777777778e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 644
},
{
"clip_ratio": 0.0,
"completion_length": 303.75,
"epoch": 1.34375,
"grad_norm": 0.009420250542461872,
"kl": 0.314453125,
"learning_rate": 5.520833333333334e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 645
},
{
"clip_ratio": 0.0,
"completion_length": 284.8125,
"epoch": 1.3458333333333332,
"grad_norm": 0.012694254517555237,
"kl": 0.3291015625,
"learning_rate": 5.513888888888889e-07,
"loss": 0.0003,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 646
},
{
"clip_ratio": 0.0,
"completion_length": 316.40625,
"epoch": 1.3479166666666667,
"grad_norm": 0.009924824349582195,
"kl": 0.30517578125,
"learning_rate": 5.506944444444445e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 647
},
{
"clip_ratio": 0.0,
"completion_length": 318.9375,
"epoch": 1.35,
"grad_norm": 0.010647875256836414,
"kl": 0.31396484375,
"learning_rate": 5.5e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 1.0,
"step": 648
},
{
"clip_ratio": 0.0,
"completion_length": 315.1875,
"epoch": 1.3520833333333333,
"grad_norm": 1.8801367282867432,
"kl": 0.310546875,
"learning_rate": 5.493055555555556e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 649
},
{
"clip_ratio": 0.0,
"completion_length": 323.875,
"epoch": 1.3541666666666667,
"grad_norm": 1.416724443435669,
"kl": 0.306640625,
"learning_rate": 5.486111111111111e-07,
"loss": 0.0003,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.5,
"step": 650
},
{
"clip_ratio": 0.0,
"completion_length": 316.875,
"epoch": 1.35625,
"grad_norm": 1.2208101749420166,
"kl": 0.31689453125,
"learning_rate": 5.479166666666667e-07,
"loss": 0.0003,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 651
},
{
"clip_ratio": 0.0,
"completion_length": 309.75,
"epoch": 1.3583333333333334,
"grad_norm": 1.3379855155944824,
"kl": 0.322265625,
"learning_rate": 5.472222222222222e-07,
"loss": 0.0003,
"reward": 1.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 652
},
{
"clip_ratio": 0.0,
"completion_length": 301.28125,
"epoch": 1.3604166666666666,
"grad_norm": 0.012338054366409779,
"kl": 0.32177734375,
"learning_rate": 5.465277777777777e-07,
"loss": 0.0003,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 653
},
{
"clip_ratio": 0.0,
"completion_length": 308.125,
"epoch": 1.3625,
"grad_norm": 1.8774542808532715,
"kl": 0.31494140625,
"learning_rate": 5.458333333333332e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.75,
"step": 654
},
{
"clip_ratio": 0.0,
"completion_length": 324.125,
"epoch": 1.3645833333333333,
"grad_norm": 1.324816107749939,
"kl": 0.30126953125,
"learning_rate": 5.451388888888888e-07,
"loss": 0.0003,
"reward": 0.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.25,
"step": 655
},
{
"clip_ratio": 0.0,
"completion_length": 308.375,
"epoch": 1.3666666666666667,
"grad_norm": 2.735501766204834,
"kl": 0.30419921875,
"learning_rate": 5.444444444444443e-07,
"loss": 0.0003,
"reward": 1.0625,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.75,
"step": 656
},
{
"clip_ratio": 0.0,
"completion_length": 322.0625,
"epoch": 1.36875,
"grad_norm": 4.300278663635254,
"kl": 0.333984375,
"learning_rate": 5.4375e-07,
"loss": 0.0003,
"reward": 1.6875,
"reward_std": 0.2925042062997818,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 1.0,
"step": 657
},
{
"clip_ratio": 0.0,
"completion_length": 314.375,
"epoch": 1.3708333333333333,
"grad_norm": 0.017509862780570984,
"kl": 0.30517578125,
"learning_rate": 5.430555555555555e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 658
},
{
"clip_ratio": 0.0,
"completion_length": 295.53125,
"epoch": 1.3729166666666668,
"grad_norm": 0.011455986648797989,
"kl": 0.33056640625,
"learning_rate": 5.423611111111111e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 659
},
{
"clip_ratio": 0.0,
"completion_length": 313.8125,
"epoch": 1.375,
"grad_norm": 3.3278894424438477,
"kl": 0.326171875,
"learning_rate": 5.416666666666666e-07,
"loss": 0.0003,
"reward": 1.375,
"reward_std": 0.2314550280570984,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 0.75,
"step": 660
},
{
"clip_ratio": 0.0,
"completion_length": 311.125,
"epoch": 1.3770833333333332,
"grad_norm": 1.3124842643737793,
"kl": 0.31884765625,
"learning_rate": 5.409722222222222e-07,
"loss": 0.0003,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 661
},
{
"clip_ratio": 0.0,
"completion_length": 323.625,
"epoch": 1.3791666666666667,
"grad_norm": 6.085054874420166,
"kl": 0.29736328125,
"learning_rate": 5.402777777777777e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.2925042062997818,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 1.0,
"step": 662
},
{
"clip_ratio": 0.0,
"completion_length": 306.3125,
"epoch": 1.38125,
"grad_norm": 0.010650291107594967,
"kl": 0.30908203125,
"learning_rate": 5.395833333333333e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 663
},
{
"clip_ratio": 0.0,
"completion_length": 345.1875,
"epoch": 1.3833333333333333,
"grad_norm": 0.019253544509410858,
"kl": 0.3056640625,
"learning_rate": 5.388888888888888e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 664
},
{
"clip_ratio": 0.0,
"completion_length": 313.0625,
"epoch": 1.3854166666666667,
"grad_norm": 1.2056688070297241,
"kl": 0.29931640625,
"learning_rate": 5.381944444444444e-07,
"loss": 0.0003,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 665
},
{
"clip_ratio": 0.0,
"completion_length": 358.15625,
"epoch": 1.3875,
"grad_norm": 0.012783597223460674,
"kl": 0.31298828125,
"learning_rate": 5.374999999999999e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 666
},
{
"clip_ratio": 0.0,
"completion_length": 316.28125,
"epoch": 1.3895833333333334,
"grad_norm": 2.7306346893310547,
"kl": 0.3095703125,
"learning_rate": 5.368055555555555e-07,
"loss": 0.0003,
"reward": 1.40625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.75,
"step": 667
},
{
"clip_ratio": 0.0,
"completion_length": 329.71875,
"epoch": 1.3916666666666666,
"grad_norm": 1.4300445318222046,
"kl": 0.2900390625,
"learning_rate": 5.36111111111111e-07,
"loss": 0.0003,
"reward": 1.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 1.0,
"step": 668
},
{
"clip_ratio": 0.0,
"completion_length": 330.03125,
"epoch": 1.39375,
"grad_norm": 5.7081193923950195,
"kl": 0.28271484375,
"learning_rate": 5.354166666666666e-07,
"loss": 0.0003,
"reward": 0.875,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.5,
"step": 669
},
{
"clip_ratio": 0.0,
"completion_length": 334.9375,
"epoch": 1.3958333333333333,
"grad_norm": 1.2781238555908203,
"kl": 0.31591796875,
"learning_rate": 5.347222222222221e-07,
"loss": 0.0003,
"reward": 1.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 670
},
{
"clip_ratio": 0.0,
"completion_length": 354.84375,
"epoch": 1.3979166666666667,
"grad_norm": 1.4650616645812988,
"kl": 0.29638671875,
"learning_rate": 5.340277777777778e-07,
"loss": 0.0003,
"reward": 0.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.25,
"step": 671
},
{
"clip_ratio": 0.0,
"completion_length": 338.78125,
"epoch": 1.4,
"grad_norm": 0.009217875078320503,
"kl": 0.30224609375,
"learning_rate": 5.333333333333333e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 672
},
{
"clip_ratio": 0.0,
"completion_length": 324.53125,
"epoch": 1.4020833333333333,
"grad_norm": 1.2819490432739258,
"kl": 0.33154296875,
"learning_rate": 5.326388888888889e-07,
"loss": 0.0003,
"reward": 1.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 1.0,
"step": 673
},
{
"clip_ratio": 0.0,
"completion_length": 323.34375,
"epoch": 1.4041666666666668,
"grad_norm": 0.014084805734455585,
"kl": 0.32080078125,
"learning_rate": 5.319444444444444e-07,
"loss": 0.0003,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 674
},
{
"clip_ratio": 0.0,
"completion_length": 349.9375,
"epoch": 1.40625,
"grad_norm": 0.008332760073244572,
"kl": 0.28271484375,
"learning_rate": 5.3125e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 675
},
{
"clip_ratio": 0.0,
"completion_length": 325.9375,
"epoch": 1.4083333333333332,
"grad_norm": 0.018420930951833725,
"kl": 0.32373046875,
"learning_rate": 5.305555555555555e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 676
},
{
"clip_ratio": 0.0,
"completion_length": 332.625,
"epoch": 1.4104166666666667,
"grad_norm": 1.9230810403823853,
"kl": 0.30126953125,
"learning_rate": 5.298611111111111e-07,
"loss": 0.0003,
"reward": 1.1875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 677
},
{
"clip_ratio": 0.0,
"completion_length": 361.53125,
"epoch": 1.4125,
"grad_norm": 0.009829960763454437,
"kl": 0.2998046875,
"learning_rate": 5.291666666666666e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 678
},
{
"clip_ratio": 0.0,
"completion_length": 350.1875,
"epoch": 1.4145833333333333,
"grad_norm": 0.009316562674939632,
"kl": 0.2978515625,
"learning_rate": 5.284722222222222e-07,
"loss": 0.0003,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 679
},
{
"clip_ratio": 0.0,
"completion_length": 373.125,
"epoch": 1.4166666666666667,
"grad_norm": 0.010273626074194908,
"kl": 0.28125,
"learning_rate": 5.277777777777777e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 680
},
{
"clip_ratio": 0.0,
"completion_length": 340.875,
"epoch": 1.41875,
"grad_norm": 0.018658263608813286,
"kl": 0.3154296875,
"learning_rate": 5.270833333333333e-07,
"loss": 0.0003,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 681
},
{
"clip_ratio": 0.0,
"completion_length": 374.53125,
"epoch": 1.4208333333333334,
"grad_norm": 1.121277928352356,
"kl": 0.29736328125,
"learning_rate": 5.263888888888888e-07,
"loss": 0.0003,
"reward": 1.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.75,
"step": 682
},
{
"clip_ratio": 0.0,
"completion_length": 384.21875,
"epoch": 1.4229166666666666,
"grad_norm": 0.0887700691819191,
"kl": 0.2900390625,
"learning_rate": 5.256944444444444e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 683
},
{
"clip_ratio": 0.0,
"completion_length": 371.84375,
"epoch": 1.425,
"grad_norm": 0.009055635891854763,
"kl": 0.2861328125,
"learning_rate": 5.25e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 684
},
{
"clip_ratio": 0.0,
"completion_length": 374.34375,
"epoch": 1.4270833333333333,
"grad_norm": 1.2657846212387085,
"kl": 0.296875,
"learning_rate": 5.243055555555556e-07,
"loss": 0.0003,
"reward": 1.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 1.0,
"step": 685
},
{
"clip_ratio": 0.0,
"completion_length": 359.65625,
"epoch": 1.4291666666666667,
"grad_norm": 1.931689739227295,
"kl": 0.28369140625,
"learning_rate": 5.236111111111112e-07,
"loss": 0.0003,
"reward": 1.09375,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.75,
"step": 686
},
{
"clip_ratio": 0.0,
"completion_length": 393.28125,
"epoch": 1.43125,
"grad_norm": 1.098535418510437,
"kl": 0.25341796875,
"learning_rate": 5.229166666666667e-07,
"loss": 0.0003,
"reward": 1.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 687
},
{
"clip_ratio": 0.0,
"completion_length": 382.15625,
"epoch": 1.4333333333333333,
"grad_norm": 0.008603488095104694,
"kl": 0.28125,
"learning_rate": 5.222222222222223e-07,
"loss": 0.0003,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 688
},
{
"clip_ratio": 0.0,
"completion_length": 412.46875,
"epoch": 1.4354166666666668,
"grad_norm": 1.1769787073135376,
"kl": 0.254150390625,
"learning_rate": 5.215277777777778e-07,
"loss": 0.0003,
"reward": 1.84375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 1.0,
"step": 689
},
{
"clip_ratio": 0.0,
"completion_length": 412.78125,
"epoch": 1.4375,
"grad_norm": 0.009760401211678982,
"kl": 0.2783203125,
"learning_rate": 5.208333333333334e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 690
},
{
"clip_ratio": 0.0,
"completion_length": 410.53125,
"epoch": 1.4395833333333332,
"grad_norm": 0.009675565175712109,
"kl": 0.26318359375,
"learning_rate": 5.201388888888889e-07,
"loss": 0.0003,
"reward": 1.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 691
},
{
"clip_ratio": 0.0,
"completion_length": 391.59375,
"epoch": 1.4416666666666667,
"grad_norm": 0.03820687532424927,
"kl": 0.260498046875,
"learning_rate": 5.194444444444445e-07,
"loss": 0.0003,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 692
},
{
"clip_ratio": 0.0,
"completion_length": 440.03125,
"epoch": 1.44375,
"grad_norm": 0.00820316094905138,
"kl": 0.249755859375,
"learning_rate": 5.1875e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 693
},
{
"clip_ratio": 0.0,
"completion_length": 426.1875,
"epoch": 1.4458333333333333,
"grad_norm": 0.00827599223703146,
"kl": 0.254638671875,
"learning_rate": 5.180555555555556e-07,
"loss": 0.0003,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 694
},
{
"clip_ratio": 0.0,
"completion_length": 413.3125,
"epoch": 1.4479166666666667,
"grad_norm": 1.2021926641464233,
"kl": 0.2626953125,
"learning_rate": 5.173611111111111e-07,
"loss": 0.0003,
"reward": 1.03125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.75,
"step": 695
},
{
"clip_ratio": 0.0,
"completion_length": 399.59375,
"epoch": 1.45,
"grad_norm": 2.0005180835723877,
"kl": 0.2646484375,
"learning_rate": 5.166666666666667e-07,
"loss": 0.0003,
"reward": 1.125,
"reward_std": 0.2925042062997818,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.71875,
"step": 696
},
{
"clip_ratio": 0.0,
"completion_length": 451.5,
"epoch": 1.4520833333333334,
"grad_norm": 1.7120201587677002,
"kl": 0.232177734375,
"learning_rate": 5.159722222222222e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 697
},
{
"clip_ratio": 0.0,
"completion_length": 410.46875,
"epoch": 1.4541666666666666,
"grad_norm": 4.130648136138916,
"kl": 0.2626953125,
"learning_rate": 5.152777777777777e-07,
"loss": 0.0003,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 698
},
{
"clip_ratio": 0.0,
"completion_length": 443.03125,
"epoch": 1.45625,
"grad_norm": 0.008720851503312588,
"kl": 0.248779296875,
"learning_rate": 5.145833333333332e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 699
},
{
"clip_ratio": 0.0,
"completion_length": 435.875,
"epoch": 1.4583333333333333,
"grad_norm": 0.007419127505272627,
"kl": 0.248046875,
"learning_rate": 5.138888888888889e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 700
},
{
"clip_ratio": 0.0,
"completion_length": 458.8125,
"epoch": 1.4604166666666667,
"grad_norm": 0.010546655394136906,
"kl": 0.256591796875,
"learning_rate": 5.131944444444444e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 701
},
{
"clip_ratio": 0.0,
"completion_length": 458.5625,
"epoch": 1.4625,
"grad_norm": 0.007757688872516155,
"kl": 0.258544921875,
"learning_rate": 5.125e-07,
"loss": 0.0003,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 702
},
{
"clip_ratio": 0.0,
"completion_length": 443.3125,
"epoch": 1.4645833333333333,
"grad_norm": 1.2824113368988037,
"kl": 0.240234375,
"learning_rate": 5.118055555555555e-07,
"loss": 0.0002,
"reward": 1.78125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 1.0,
"step": 703
},
{
"clip_ratio": 0.0,
"completion_length": 440.53125,
"epoch": 1.4666666666666668,
"grad_norm": 0.010234753601253033,
"kl": 0.2685546875,
"learning_rate": 5.111111111111111e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 704
},
{
"clip_ratio": 0.0,
"completion_length": 449.8125,
"epoch": 1.46875,
"grad_norm": 0.020811883732676506,
"kl": 0.262939453125,
"learning_rate": 5.104166666666666e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 705
},
{
"clip_ratio": 0.0,
"completion_length": 480.28125,
"epoch": 1.4708333333333332,
"grad_norm": 0.017371410503983498,
"kl": 0.24267578125,
"learning_rate": 5.097222222222222e-07,
"loss": 0.0002,
"reward": 1.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 706
},
{
"clip_ratio": 0.0,
"completion_length": 446.5,
"epoch": 1.4729166666666667,
"grad_norm": 0.008478788658976555,
"kl": 0.246826171875,
"learning_rate": 5.090277777777777e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 707
},
{
"clip_ratio": 0.0,
"completion_length": 444.0625,
"epoch": 1.475,
"grad_norm": 1.548750877380371,
"kl": 0.252685546875,
"learning_rate": 5.083333333333333e-07,
"loss": 0.0003,
"reward": 1.40625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.75,
"step": 708
},
{
"clip_ratio": 0.0,
"completion_length": 460.4375,
"epoch": 1.4770833333333333,
"grad_norm": 1.1799025535583496,
"kl": 0.25927734375,
"learning_rate": 5.076388888888888e-07,
"loss": 0.0003,
"reward": 1.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.75,
"step": 709
},
{
"clip_ratio": 0.0,
"completion_length": 437.78125,
"epoch": 1.4791666666666667,
"grad_norm": 7.808667182922363,
"kl": 0.256591796875,
"learning_rate": 5.069444444444444e-07,
"loss": 0.0003,
"reward": 1.03125,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.75,
"step": 710
},
{
"clip_ratio": 0.0,
"completion_length": 416.9375,
"epoch": 1.48125,
"grad_norm": 0.009599937126040459,
"kl": 0.27587890625,
"learning_rate": 5.062499999999999e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 711
},
{
"clip_ratio": 0.0,
"completion_length": 449.3125,
"epoch": 1.4833333333333334,
"grad_norm": 0.014995035715401173,
"kl": 0.254638671875,
"learning_rate": 5.055555555555555e-07,
"loss": 0.0003,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 712
},
{
"clip_ratio": 0.0,
"completion_length": 439.21875,
"epoch": 1.4854166666666666,
"grad_norm": 0.008672283962368965,
"kl": 0.25390625,
"learning_rate": 5.04861111111111e-07,
"loss": 0.0003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 713
},
{
"clip_ratio": 0.0,
"completion_length": 426.34375,
"epoch": 1.4875,
"grad_norm": 1.2915209531784058,
"kl": 0.2451171875,
"learning_rate": 5.041666666666667e-07,
"loss": 0.0002,
"reward": 1.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 714
},
{
"clip_ratio": 0.0,
"completion_length": 427.6875,
"epoch": 1.4895833333333333,
"grad_norm": 0.011465529911220074,
"kl": 0.26611328125,
"learning_rate": 5.034722222222222e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 715
},
{
"clip_ratio": 0.0,
"completion_length": 435.0,
"epoch": 1.4916666666666667,
"grad_norm": 1.1036456823349,
"kl": 0.28173828125,
"learning_rate": 5.027777777777778e-07,
"loss": 0.0003,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 716
},
{
"clip_ratio": 0.0,
"completion_length": 463.15625,
"epoch": 1.49375,
"grad_norm": 1.2302424907684326,
"kl": 0.2587890625,
"learning_rate": 5.020833333333333e-07,
"loss": 0.0003,
"reward": 1.8125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.8125,
"rewards/format_reward": 1.0,
"step": 717
},
{
"clip_ratio": 0.0,
"completion_length": 453.78125,
"epoch": 1.4958333333333333,
"grad_norm": 2.1474807262420654,
"kl": 0.240966796875,
"learning_rate": 5.013888888888889e-07,
"loss": 0.0002,
"reward": 1.4375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 718
},
{
"clip_ratio": 0.0,
"completion_length": 398.46875,
"epoch": 1.4979166666666668,
"grad_norm": 0.00977454986423254,
"kl": 0.26806640625,
"learning_rate": 5.006944444444444e-07,
"loss": 0.0003,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 719
},
{
"clip_ratio": 0.0,
"completion_length": 419.625,
"epoch": 1.5,
"grad_norm": 1.7190418243408203,
"kl": 0.2734375,
"learning_rate": 5e-07,
"loss": 0.0003,
"reward": 0.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.5,
"step": 720
},
{
"clip_ratio": 0.0,
"completion_length": 418.5,
"epoch": 1.5020833333333332,
"grad_norm": 1.1904847621917725,
"kl": 0.272705078125,
"learning_rate": 4.993055555555555e-07,
"loss": 0.0003,
"reward": 1.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.75,
"step": 721
},
{
"clip_ratio": 0.0,
"completion_length": 419.1875,
"epoch": 1.5041666666666667,
"grad_norm": 0.009021605364978313,
"kl": 0.261474609375,
"learning_rate": 4.986111111111111e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 722
},
{
"clip_ratio": 0.0,
"completion_length": 418.375,
"epoch": 1.50625,
"grad_norm": 2.251453399658203,
"kl": 0.26171875,
"learning_rate": 4.979166666666666e-07,
"loss": 0.0003,
"reward": 1.03125,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.75,
"step": 723
},
{
"clip_ratio": 0.0,
"completion_length": 423.65625,
"epoch": 1.5083333333333333,
"grad_norm": 0.008535100147128105,
"kl": 0.2646484375,
"learning_rate": 4.972222222222222e-07,
"loss": 0.0003,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 724
},
{
"clip_ratio": 0.0,
"completion_length": 439.71875,
"epoch": 1.5104166666666665,
"grad_norm": 1.3646361827850342,
"kl": 0.249755859375,
"learning_rate": 4.965277777777777e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 725
},
{
"clip_ratio": 0.0,
"completion_length": 392.09375,
"epoch": 1.5125,
"grad_norm": 1.2811659574508667,
"kl": 0.275390625,
"learning_rate": 4.958333333333333e-07,
"loss": 0.0003,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 726
},
{
"clip_ratio": 0.0,
"completion_length": 428.65625,
"epoch": 1.5145833333333334,
"grad_norm": 1.3191869258880615,
"kl": 0.2470703125,
"learning_rate": 4.951388888888889e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 727
},
{
"clip_ratio": 0.0,
"completion_length": 433.78125,
"epoch": 1.5166666666666666,
"grad_norm": 0.009647144004702568,
"kl": 0.262451171875,
"learning_rate": 4.944444444444445e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 728
},
{
"clip_ratio": 0.0,
"completion_length": 408.34375,
"epoch": 1.51875,
"grad_norm": 1.3683720827102661,
"kl": 0.263671875,
"learning_rate": 4.9375e-07,
"loss": 0.0003,
"reward": 1.34375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 729
},
{
"clip_ratio": 0.0,
"completion_length": 445.59375,
"epoch": 1.5208333333333335,
"grad_norm": 1.4309377670288086,
"kl": 0.263427734375,
"learning_rate": 4.930555555555556e-07,
"loss": 0.0003,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 730
},
{
"clip_ratio": 0.0,
"completion_length": 426.78125,
"epoch": 1.5229166666666667,
"grad_norm": 0.009884395636618137,
"kl": 0.243408203125,
"learning_rate": 4.923611111111111e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 731
},
{
"clip_ratio": 0.0,
"completion_length": 404.0625,
"epoch": 1.525,
"grad_norm": 0.011238432489335537,
"kl": 0.274169921875,
"learning_rate": 4.916666666666666e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 732
},
{
"clip_ratio": 0.0,
"completion_length": 428.6875,
"epoch": 1.5270833333333333,
"grad_norm": 0.008788060396909714,
"kl": 0.25244140625,
"learning_rate": 4.909722222222221e-07,
"loss": 0.0003,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 733
},
{
"clip_ratio": 0.0,
"completion_length": 448.5625,
"epoch": 1.5291666666666668,
"grad_norm": 0.026375016197562218,
"kl": 0.2587890625,
"learning_rate": 4.902777777777777e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 734
},
{
"clip_ratio": 0.0,
"completion_length": 418.0,
"epoch": 1.53125,
"grad_norm": 0.00788712315261364,
"kl": 0.2666015625,
"learning_rate": 4.895833333333333e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 735
},
{
"clip_ratio": 0.0,
"completion_length": 436.71875,
"epoch": 1.5333333333333332,
"grad_norm": 0.012409915216267109,
"kl": 0.248046875,
"learning_rate": 4.888888888888889e-07,
"loss": 0.0002,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 736
},
{
"clip_ratio": 0.0,
"completion_length": 405.75,
"epoch": 1.5354166666666667,
"grad_norm": 1.6673911809921265,
"kl": 0.268310546875,
"learning_rate": 4.881944444444444e-07,
"loss": 0.0003,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 737
},
{
"clip_ratio": 0.0,
"completion_length": 426.15625,
"epoch": 1.5375,
"grad_norm": 14.521210670471191,
"kl": 0.26953125,
"learning_rate": 4.875e-07,
"loss": 0.0003,
"reward": 0.71875,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.5,
"step": 738
},
{
"clip_ratio": 0.0,
"completion_length": 402.6875,
"epoch": 1.5395833333333333,
"grad_norm": 0.008589212782680988,
"kl": 0.271484375,
"learning_rate": 4.868055555555555e-07,
"loss": 0.0003,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 739
},
{
"clip_ratio": 0.0,
"completion_length": 439.5625,
"epoch": 1.5416666666666665,
"grad_norm": 0.00834252592176199,
"kl": 0.24853515625,
"learning_rate": 4.861111111111111e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 740
},
{
"clip_ratio": 0.0,
"completion_length": 456.375,
"epoch": 1.54375,
"grad_norm": 0.0074338242411613464,
"kl": 0.25732421875,
"learning_rate": 4.854166666666666e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 741
},
{
"clip_ratio": 0.0,
"completion_length": 439.125,
"epoch": 1.5458333333333334,
"grad_norm": 0.00910673663020134,
"kl": 0.260986328125,
"learning_rate": 4.847222222222222e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 742
},
{
"clip_ratio": 0.0,
"completion_length": 439.125,
"epoch": 1.5479166666666666,
"grad_norm": 1.3869645595550537,
"kl": 0.267333984375,
"learning_rate": 4.840277777777777e-07,
"loss": 0.0003,
"reward": 0.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.5,
"step": 743
},
{
"clip_ratio": 0.0,
"completion_length": 405.84375,
"epoch": 1.55,
"grad_norm": 2.4322197437286377,
"kl": 0.267822265625,
"learning_rate": 4.833333333333333e-07,
"loss": 0.0003,
"reward": 1.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 1.0,
"step": 744
},
{
"clip_ratio": 0.0,
"completion_length": 429.34375,
"epoch": 1.5520833333333335,
"grad_norm": 1.4897021055221558,
"kl": 0.237548828125,
"learning_rate": 4.826388888888888e-07,
"loss": 0.0002,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 745
},
{
"clip_ratio": 0.0,
"completion_length": 453.5625,
"epoch": 1.5541666666666667,
"grad_norm": 1.5762288570404053,
"kl": 0.23681640625,
"learning_rate": 4.819444444444444e-07,
"loss": 0.0002,
"reward": 1.28125,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.75,
"step": 746
},
{
"clip_ratio": 0.0,
"completion_length": 444.53125,
"epoch": 1.55625,
"grad_norm": 0.022547965869307518,
"kl": 0.251220703125,
"learning_rate": 4.812499999999999e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 747
},
{
"clip_ratio": 0.0,
"completion_length": 460.375,
"epoch": 1.5583333333333333,
"grad_norm": 0.009397338144481182,
"kl": 0.25732421875,
"learning_rate": 4.805555555555555e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 748
},
{
"clip_ratio": 0.0,
"completion_length": 452.625,
"epoch": 1.5604166666666668,
"grad_norm": 0.013823941349983215,
"kl": 0.2705078125,
"learning_rate": 4.798611111111112e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 749
},
{
"clip_ratio": 0.0,
"completion_length": 467.40625,
"epoch": 1.5625,
"grad_norm": 0.008257027715444565,
"kl": 0.24951171875,
"learning_rate": 4.791666666666667e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 750
},
{
"clip_ratio": 0.0,
"completion_length": 464.71875,
"epoch": 1.5645833333333332,
"grad_norm": 1.193889856338501,
"kl": 0.254150390625,
"learning_rate": 4.784722222222223e-07,
"loss": 0.0003,
"reward": 1.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 751
},
{
"clip_ratio": 0.0,
"completion_length": 451.96875,
"epoch": 1.5666666666666667,
"grad_norm": 1.2460548877716064,
"kl": 0.25048828125,
"learning_rate": 4.777777777777778e-07,
"loss": 0.0003,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.5,
"step": 752
},
{
"clip_ratio": 0.0,
"completion_length": 510.15625,
"epoch": 1.56875,
"grad_norm": 1.2181428670883179,
"kl": 0.24365234375,
"learning_rate": 4.770833333333334e-07,
"loss": 0.0002,
"reward": 1.6875,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 1.0,
"step": 753
},
{
"clip_ratio": 0.0,
"completion_length": 485.40625,
"epoch": 1.5708333333333333,
"grad_norm": 0.007452361285686493,
"kl": 0.2451171875,
"learning_rate": 4.7638888888888885e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 754
},
{
"clip_ratio": 0.0,
"completion_length": 523.6875,
"epoch": 1.5729166666666665,
"grad_norm": 1.5803804397583008,
"kl": 0.23583984375,
"learning_rate": 4.756944444444444e-07,
"loss": 0.0002,
"reward": 1.4375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 755
},
{
"clip_ratio": 0.0,
"completion_length": 499.4375,
"epoch": 1.575,
"grad_norm": 1.030873417854309,
"kl": 0.259765625,
"learning_rate": 4.7499999999999995e-07,
"loss": 0.0003,
"reward": 1.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 756
},
{
"clip_ratio": 0.0,
"completion_length": 498.875,
"epoch": 1.5770833333333334,
"grad_norm": 0.008705949410796165,
"kl": 0.248779296875,
"learning_rate": 4.743055555555555e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 757
},
{
"clip_ratio": 0.0,
"completion_length": 474.09375,
"epoch": 1.5791666666666666,
"grad_norm": 1.3716362714767456,
"kl": 0.236328125,
"learning_rate": 4.7361111111111105e-07,
"loss": 0.0002,
"reward": 0.90625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.71875,
"step": 758
},
{
"clip_ratio": 0.0,
"completion_length": 477.0,
"epoch": 1.58125,
"grad_norm": 0.007176287472248077,
"kl": 0.23779296875,
"learning_rate": 4.7291666666666666e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 759
},
{
"clip_ratio": 0.0,
"completion_length": 482.21875,
"epoch": 1.5833333333333335,
"grad_norm": 1.3382846117019653,
"kl": 0.238525390625,
"learning_rate": 4.722222222222222e-07,
"loss": 0.0002,
"reward": 1.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 760
},
{
"clip_ratio": 0.0,
"completion_length": 522.5625,
"epoch": 1.5854166666666667,
"grad_norm": 0.007611650042235851,
"kl": 0.23583984375,
"learning_rate": 4.7152777777777776e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 1.0,
"step": 761
},
{
"clip_ratio": 0.0,
"completion_length": 567.78125,
"epoch": 1.5875,
"grad_norm": 0.4260139465332031,
"kl": 0.2158203125,
"learning_rate": 4.708333333333333e-07,
"loss": 0.0002,
"reward": 1.1875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.71875,
"step": 762
},
{
"clip_ratio": 0.0,
"completion_length": 502.65625,
"epoch": 1.5895833333333333,
"grad_norm": 0.011225441470742226,
"kl": 0.253173828125,
"learning_rate": 4.7013888888888886e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 763
},
{
"clip_ratio": 0.0,
"completion_length": 500.5,
"epoch": 1.5916666666666668,
"grad_norm": 0.008185843005776405,
"kl": 0.244140625,
"learning_rate": 4.694444444444444e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 764
},
{
"clip_ratio": 0.0,
"completion_length": 456.3125,
"epoch": 1.59375,
"grad_norm": 1.2148336172103882,
"kl": 0.236572265625,
"learning_rate": 4.6874999999999996e-07,
"loss": 0.0002,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 765
},
{
"clip_ratio": 0.0,
"completion_length": 485.75,
"epoch": 1.5958333333333332,
"grad_norm": 0.009695029817521572,
"kl": 0.255859375,
"learning_rate": 4.6805555555555556e-07,
"loss": 0.0003,
"reward": 1.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 766
},
{
"clip_ratio": 0.0,
"completion_length": 477.15625,
"epoch": 1.5979166666666667,
"grad_norm": 1.706971287727356,
"kl": 0.252685546875,
"learning_rate": 4.673611111111111e-07,
"loss": 0.0003,
"reward": 1.40625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.75,
"step": 767
},
{
"clip_ratio": 0.0,
"completion_length": 520.9375,
"epoch": 1.6,
"grad_norm": 0.025292934849858284,
"kl": 0.259765625,
"learning_rate": 4.6666666666666666e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 768
},
{
"clip_ratio": 0.0,
"completion_length": 490.34375,
"epoch": 1.6020833333333333,
"grad_norm": 1.5667407512664795,
"kl": 0.23681640625,
"learning_rate": 4.659722222222222e-07,
"loss": 0.0002,
"reward": 0.90625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.5,
"step": 769
},
{
"clip_ratio": 0.0,
"completion_length": 460.59375,
"epoch": 1.6041666666666665,
"grad_norm": 0.011945800855755806,
"kl": 0.24755859375,
"learning_rate": 4.6527777777777776e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 770
},
{
"clip_ratio": 0.0,
"completion_length": 504.65625,
"epoch": 1.60625,
"grad_norm": 0.009756573475897312,
"kl": 0.246826171875,
"learning_rate": 4.645833333333333e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 771
},
{
"clip_ratio": 0.0,
"completion_length": 472.53125,
"epoch": 1.6083333333333334,
"grad_norm": 4.669853687286377,
"kl": 0.27197265625,
"learning_rate": 4.6388888888888886e-07,
"loss": 0.0003,
"reward": 1.1875,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 772
},
{
"clip_ratio": 0.0,
"completion_length": 482.53125,
"epoch": 1.6104166666666666,
"grad_norm": 0.007559712044894695,
"kl": 0.246337890625,
"learning_rate": 4.6319444444444447e-07,
"loss": 0.0002,
"reward": 1.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 773
},
{
"clip_ratio": 0.0,
"completion_length": 485.75,
"epoch": 1.6125,
"grad_norm": 0.01486685499548912,
"kl": 0.25146484375,
"learning_rate": 4.625e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 774
},
{
"clip_ratio": 0.0,
"completion_length": 468.71875,
"epoch": 1.6145833333333335,
"grad_norm": 0.009248960763216019,
"kl": 0.2490234375,
"learning_rate": 4.6180555555555557e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 775
},
{
"clip_ratio": 0.0,
"completion_length": 473.84375,
"epoch": 1.6166666666666667,
"grad_norm": 0.010325673967599869,
"kl": 0.248046875,
"learning_rate": 4.611111111111111e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 776
},
{
"clip_ratio": 0.0,
"completion_length": 479.3125,
"epoch": 1.61875,
"grad_norm": 1.281834602355957,
"kl": 0.23974609375,
"learning_rate": 4.604166666666666e-07,
"loss": 0.0002,
"reward": 1.78125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.78125,
"rewards/format_reward": 1.0,
"step": 777
},
{
"clip_ratio": 0.0,
"completion_length": 462.375,
"epoch": 1.6208333333333333,
"grad_norm": 1.2257119417190552,
"kl": 0.25634765625,
"learning_rate": 4.5972222222222217e-07,
"loss": 0.0003,
"reward": 0.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.25,
"step": 778
},
{
"clip_ratio": 0.0,
"completion_length": 467.65625,
"epoch": 1.6229166666666668,
"grad_norm": 1.4265564680099487,
"kl": 0.245849609375,
"learning_rate": 4.590277777777777e-07,
"loss": 0.0002,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 779
},
{
"clip_ratio": 0.0,
"completion_length": 494.78125,
"epoch": 1.625,
"grad_norm": 0.008513258770108223,
"kl": 0.253662109375,
"learning_rate": 4.5833333333333327e-07,
"loss": 0.0003,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 780
},
{
"clip_ratio": 0.0,
"completion_length": 476.125,
"epoch": 1.6270833333333332,
"grad_norm": 1.229500412940979,
"kl": 0.24609375,
"learning_rate": 4.5763888888888887e-07,
"loss": 0.0002,
"reward": 1.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.75,
"step": 781
},
{
"clip_ratio": 0.0,
"completion_length": 476.46875,
"epoch": 1.6291666666666667,
"grad_norm": 1.1059486865997314,
"kl": 0.248779296875,
"learning_rate": 4.569444444444444e-07,
"loss": 0.0002,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 782
},
{
"clip_ratio": 0.0,
"completion_length": 472.34375,
"epoch": 1.63125,
"grad_norm": 0.015273602679371834,
"kl": 0.265869140625,
"learning_rate": 4.5624999999999997e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.75,
"step": 783
},
{
"clip_ratio": 0.0,
"completion_length": 465.9375,
"epoch": 1.6333333333333333,
"grad_norm": 0.009410897269845009,
"kl": 0.261962890625,
"learning_rate": 4.555555555555555e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 1.0,
"step": 784
},
{
"clip_ratio": 0.0,
"completion_length": 498.96875,
"epoch": 1.6354166666666665,
"grad_norm": 1.280187726020813,
"kl": 0.245361328125,
"learning_rate": 4.548611111111111e-07,
"loss": 0.0002,
"reward": 0.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.25,
"step": 785
},
{
"clip_ratio": 0.0,
"completion_length": 495.65625,
"epoch": 1.6375,
"grad_norm": 0.017177268862724304,
"kl": 0.235595703125,
"learning_rate": 4.541666666666666e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 786
},
{
"clip_ratio": 0.0,
"completion_length": 437.84375,
"epoch": 1.6395833333333334,
"grad_norm": 0.008518899790942669,
"kl": 0.239501953125,
"learning_rate": 4.534722222222222e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 787
},
{
"clip_ratio": 0.0,
"completion_length": 506.125,
"epoch": 1.6416666666666666,
"grad_norm": 0.008767511695623398,
"kl": 0.2353515625,
"learning_rate": 4.527777777777778e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 788
},
{
"clip_ratio": 0.0,
"completion_length": 489.53125,
"epoch": 1.64375,
"grad_norm": 1.5510228872299194,
"kl": 0.24658203125,
"learning_rate": 4.5208333333333333e-07,
"loss": 0.0002,
"reward": 1.40625,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.65625,
"rewards/format_reward": 0.75,
"step": 789
},
{
"clip_ratio": 0.0,
"completion_length": 491.375,
"epoch": 1.6458333333333335,
"grad_norm": 2.2926583290100098,
"kl": 0.24755859375,
"learning_rate": 4.513888888888889e-07,
"loss": 0.0002,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 790
},
{
"clip_ratio": 0.0,
"completion_length": 537.40625,
"epoch": 1.6479166666666667,
"grad_norm": 0.01068432629108429,
"kl": 0.226806640625,
"learning_rate": 4.5069444444444443e-07,
"loss": 0.0002,
"reward": 1.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 791
},
{
"clip_ratio": 0.0,
"completion_length": 480.5,
"epoch": 1.65,
"grad_norm": 0.007735088467597961,
"kl": 0.244873046875,
"learning_rate": 4.5e-07,
"loss": 0.0002,
"reward": 1.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 792
},
{
"clip_ratio": 0.0,
"completion_length": 524.65625,
"epoch": 1.6520833333333333,
"grad_norm": 0.9811253547668457,
"kl": 0.242431640625,
"learning_rate": 4.4930555555555553e-07,
"loss": 0.0002,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 793
},
{
"clip_ratio": 0.0,
"completion_length": 508.4375,
"epoch": 1.6541666666666668,
"grad_norm": 1.101609468460083,
"kl": 0.236328125,
"learning_rate": 4.486111111111111e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 1.0,
"step": 794
},
{
"clip_ratio": 0.0,
"completion_length": 544.1875,
"epoch": 1.65625,
"grad_norm": 10.281743049621582,
"kl": 0.244873046875,
"learning_rate": 4.479166666666667e-07,
"loss": 0.0002,
"reward": 1.6875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 1.0,
"step": 795
},
{
"clip_ratio": 0.0,
"completion_length": 530.8125,
"epoch": 1.6583333333333332,
"grad_norm": 1.3257184028625488,
"kl": 0.244384765625,
"learning_rate": 4.4722222222222223e-07,
"loss": 0.0002,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 796
},
{
"clip_ratio": 0.0,
"completion_length": 524.6875,
"epoch": 1.6604166666666667,
"grad_norm": 1.5066584348678589,
"kl": 0.243896484375,
"learning_rate": 4.465277777777778e-07,
"loss": 0.0002,
"reward": 1.5625,
"reward_std": 0.2177756354212761,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 1.0,
"step": 797
},
{
"clip_ratio": 0.0,
"completion_length": 523.4375,
"epoch": 1.6625,
"grad_norm": 0.010895702056586742,
"kl": 0.2431640625,
"learning_rate": 4.4583333333333334e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 798
},
{
"clip_ratio": 0.0,
"completion_length": 500.21875,
"epoch": 1.6645833333333333,
"grad_norm": 0.007907292805612087,
"kl": 0.24853515625,
"learning_rate": 4.4513888888888883e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 799
},
{
"clip_ratio": 0.0,
"completion_length": 558.1875,
"epoch": 1.6666666666666665,
"grad_norm": 1.0693315267562866,
"kl": 0.233154296875,
"learning_rate": 4.444444444444444e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 800
},
{
"clip_ratio": 0.0,
"completion_length": 538.3125,
"epoch": 1.66875,
"grad_norm": 0.028570212423801422,
"kl": 0.235107421875,
"learning_rate": 4.4374999999999993e-07,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 801
},
{
"clip_ratio": 0.0,
"completion_length": 580.0,
"epoch": 1.6708333333333334,
"grad_norm": 0.02017505094408989,
"kl": 0.254150390625,
"learning_rate": 4.4305555555555554e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 802
},
{
"clip_ratio": 0.0,
"completion_length": 564.25,
"epoch": 1.6729166666666666,
"grad_norm": 3.2598605155944824,
"kl": 0.226806640625,
"learning_rate": 4.423611111111111e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 1.0,
"step": 803
},
{
"clip_ratio": 0.0,
"completion_length": 517.1875,
"epoch": 1.675,
"grad_norm": 1.6672528982162476,
"kl": 0.24658203125,
"learning_rate": 4.4166666666666664e-07,
"loss": 0.0002,
"reward": 1.65625,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.96875,
"step": 804
},
{
"clip_ratio": 0.0,
"completion_length": 549.15625,
"epoch": 1.6770833333333335,
"grad_norm": 0.006843992974609137,
"kl": 0.23779296875,
"learning_rate": 4.409722222222222e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 805
},
{
"clip_ratio": 0.0,
"completion_length": 590.25,
"epoch": 1.6791666666666667,
"grad_norm": 0.009703945368528366,
"kl": 0.2314453125,
"learning_rate": 4.4027777777777774e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 806
},
{
"clip_ratio": 0.0,
"completion_length": 553.375,
"epoch": 1.68125,
"grad_norm": 0.9394215941429138,
"kl": 0.2373046875,
"learning_rate": 4.395833333333333e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 807
},
{
"clip_ratio": 0.0,
"completion_length": 565.5,
"epoch": 1.6833333333333333,
"grad_norm": 0.009763069450855255,
"kl": 0.237060546875,
"learning_rate": 4.3888888888888884e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 808
},
{
"clip_ratio": 0.0,
"completion_length": 550.1875,
"epoch": 1.6854166666666668,
"grad_norm": 1.487508773803711,
"kl": 0.23876953125,
"learning_rate": 4.3819444444444444e-07,
"loss": 0.0002,
"reward": 0.5625,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.5,
"step": 809
},
{
"clip_ratio": 0.0,
"completion_length": 575.03125,
"epoch": 1.6875,
"grad_norm": 1.174086332321167,
"kl": 0.220458984375,
"learning_rate": 4.375e-07,
"loss": 0.0002,
"reward": 0.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.25,
"step": 810
},
{
"clip_ratio": 0.0,
"completion_length": 547.21875,
"epoch": 1.6895833333333332,
"grad_norm": 0.010807516053318977,
"kl": 0.236572265625,
"learning_rate": 4.3680555555555554e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 1.0,
"step": 811
},
{
"clip_ratio": 0.0,
"completion_length": 601.96875,
"epoch": 1.6916666666666667,
"grad_norm": 1.2976332902908325,
"kl": 0.2353515625,
"learning_rate": 4.361111111111111e-07,
"loss": 0.0002,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.75,
"step": 812
},
{
"clip_ratio": 0.0,
"completion_length": 587.5,
"epoch": 1.69375,
"grad_norm": 0.006790046114474535,
"kl": 0.226318359375,
"learning_rate": 4.3541666666666664e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 813
},
{
"clip_ratio": 0.0,
"completion_length": 587.125,
"epoch": 1.6958333333333333,
"grad_norm": 0.9818464517593384,
"kl": 0.23876953125,
"learning_rate": 4.347222222222222e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.71875,
"step": 814
},
{
"clip_ratio": 0.0,
"completion_length": 567.78125,
"epoch": 1.6979166666666665,
"grad_norm": 0.008597256615757942,
"kl": 0.23388671875,
"learning_rate": 4.3402777777777775e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 815
},
{
"clip_ratio": 0.0,
"completion_length": 594.125,
"epoch": 1.7,
"grad_norm": 0.008699624799191952,
"kl": 0.22705078125,
"learning_rate": 4.3333333333333335e-07,
"loss": 0.0002,
"reward": 1.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 816
},
{
"clip_ratio": 0.0,
"completion_length": 572.375,
"epoch": 1.7020833333333334,
"grad_norm": 0.00947485025972128,
"kl": 0.25390625,
"learning_rate": 4.326388888888889e-07,
"loss": 0.0003,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 817
},
{
"clip_ratio": 0.0,
"completion_length": 566.21875,
"epoch": 1.7041666666666666,
"grad_norm": 0.008000546135008335,
"kl": 0.236572265625,
"learning_rate": 4.3194444444444445e-07,
"loss": 0.0002,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 818
},
{
"clip_ratio": 0.0,
"completion_length": 555.25,
"epoch": 1.70625,
"grad_norm": 1.0371057987213135,
"kl": 0.2333984375,
"learning_rate": 4.3125e-07,
"loss": 0.0002,
"reward": 1.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.75,
"step": 819
},
{
"clip_ratio": 0.0,
"completion_length": 588.8125,
"epoch": 1.7083333333333335,
"grad_norm": 0.834037184715271,
"kl": 0.23779296875,
"learning_rate": 4.3055555555555555e-07,
"loss": 0.0002,
"reward": 1.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.53125,
"rewards/format_reward": 0.75,
"step": 820
},
{
"clip_ratio": 0.0,
"completion_length": 573.34375,
"epoch": 1.7104166666666667,
"grad_norm": 0.00836244411766529,
"kl": 0.239990234375,
"learning_rate": 4.298611111111111e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 821
},
{
"clip_ratio": 0.0,
"completion_length": 562.40625,
"epoch": 1.7125,
"grad_norm": 0.00962614081799984,
"kl": 0.231689453125,
"learning_rate": 4.291666666666666e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 822
},
{
"clip_ratio": 0.0,
"completion_length": 553.0625,
"epoch": 1.7145833333333333,
"grad_norm": 0.008219579234719276,
"kl": 0.23388671875,
"learning_rate": 4.284722222222222e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 1.0,
"step": 823
},
{
"clip_ratio": 0.0,
"completion_length": 554.125,
"epoch": 1.7166666666666668,
"grad_norm": 0.012971649877727032,
"kl": 0.24169921875,
"learning_rate": 4.2777777777777775e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 824
},
{
"clip_ratio": 0.0,
"completion_length": 551.53125,
"epoch": 1.71875,
"grad_norm": 1.1302229166030884,
"kl": 0.2275390625,
"learning_rate": 4.270833333333333e-07,
"loss": 0.0002,
"reward": 1.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 825
},
{
"clip_ratio": 0.0,
"completion_length": 566.25,
"epoch": 1.7208333333333332,
"grad_norm": 0.010292713530361652,
"kl": 0.22119140625,
"learning_rate": 4.2638888888888885e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 826
},
{
"clip_ratio": 0.0,
"completion_length": 549.5625,
"epoch": 1.7229166666666667,
"grad_norm": 1.4789761304855347,
"kl": 0.241943359375,
"learning_rate": 4.256944444444444e-07,
"loss": 0.0002,
"reward": 1.84375,
"reward_std": 0.22201896458864212,
"rewards/accuracy_reward": 0.84375,
"rewards/format_reward": 1.0,
"step": 827
},
{
"clip_ratio": 0.0,
"completion_length": 569.78125,
"epoch": 1.725,
"grad_norm": 1.0435210466384888,
"kl": 0.2333984375,
"learning_rate": 4.2499999999999995e-07,
"loss": 0.0002,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.5,
"step": 828
},
{
"clip_ratio": 0.0,
"completion_length": 575.15625,
"epoch": 1.7270833333333333,
"grad_norm": 0.012116172350943089,
"kl": 0.216064453125,
"learning_rate": 4.243055555555555e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 829
},
{
"clip_ratio": 0.0,
"completion_length": 493.59375,
"epoch": 1.7291666666666665,
"grad_norm": 0.007295367773622274,
"kl": 0.2412109375,
"learning_rate": 4.236111111111111e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 830
},
{
"clip_ratio": 0.0,
"completion_length": 549.15625,
"epoch": 1.73125,
"grad_norm": 0.9141552448272705,
"kl": 0.227783203125,
"learning_rate": 4.2291666666666666e-07,
"loss": 0.0002,
"reward": 0.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.25,
"step": 831
},
{
"clip_ratio": 0.0,
"completion_length": 514.8125,
"epoch": 1.7333333333333334,
"grad_norm": 1.5482479333877563,
"kl": 0.240478515625,
"learning_rate": 4.222222222222222e-07,
"loss": 0.0002,
"reward": 1.9375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 832
},
{
"clip_ratio": 0.0,
"completion_length": 543.96875,
"epoch": 1.7354166666666666,
"grad_norm": 0.013914127834141254,
"kl": 0.24267578125,
"learning_rate": 4.2152777777777776e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 833
},
{
"clip_ratio": 0.0,
"completion_length": 515.0,
"epoch": 1.7375,
"grad_norm": 1.0192515850067139,
"kl": 0.22802734375,
"learning_rate": 4.208333333333333e-07,
"loss": 0.0002,
"reward": 1.6875,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 1.0,
"step": 834
},
{
"clip_ratio": 0.0,
"completion_length": 564.8125,
"epoch": 1.7395833333333335,
"grad_norm": 0.00643956009298563,
"kl": 0.2177734375,
"learning_rate": 4.2013888888888886e-07,
"loss": 0.0002,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 835
},
{
"clip_ratio": 0.0,
"completion_length": 508.90625,
"epoch": 1.7416666666666667,
"grad_norm": 0.008250262588262558,
"kl": 0.24462890625,
"learning_rate": 4.194444444444444e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.75,
"step": 836
},
{
"clip_ratio": 0.0,
"completion_length": 531.65625,
"epoch": 1.74375,
"grad_norm": 0.017665982246398926,
"kl": 0.261962890625,
"learning_rate": 4.1875e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 837
},
{
"clip_ratio": 0.0,
"completion_length": 547.0,
"epoch": 1.7458333333333333,
"grad_norm": 0.01037755236029625,
"kl": 0.2333984375,
"learning_rate": 4.1805555555555556e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 838
},
{
"clip_ratio": 0.0,
"completion_length": 534.09375,
"epoch": 1.7479166666666668,
"grad_norm": 1.4672600030899048,
"kl": 0.2333984375,
"learning_rate": 4.173611111111111e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 839
},
{
"clip_ratio": 0.0,
"completion_length": 528.3125,
"epoch": 1.75,
"grad_norm": 1.229001522064209,
"kl": 0.202880859375,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.0002,
"reward": 1.03125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.75,
"step": 840
},
{
"clip_ratio": 0.0,
"completion_length": 502.15625,
"epoch": 1.7520833333333332,
"grad_norm": 0.010309334844350815,
"kl": 0.255126953125,
"learning_rate": 4.159722222222222e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 841
},
{
"clip_ratio": 0.0,
"completion_length": 558.59375,
"epoch": 1.7541666666666667,
"grad_norm": 0.0066090915352106094,
"kl": 0.232177734375,
"learning_rate": 4.1527777777777777e-07,
"loss": 0.0002,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 842
},
{
"clip_ratio": 0.0,
"completion_length": 505.65625,
"epoch": 1.75625,
"grad_norm": 0.007594072259962559,
"kl": 0.236083984375,
"learning_rate": 4.145833333333333e-07,
"loss": 0.0002,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 843
},
{
"clip_ratio": 0.0,
"completion_length": 506.71875,
"epoch": 1.7583333333333333,
"grad_norm": 1.4959443807601929,
"kl": 0.229248046875,
"learning_rate": 4.1388888888888887e-07,
"loss": 0.0002,
"reward": 0.78125,
"reward_std": 0.2041158601641655,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.5,
"step": 844
},
{
"clip_ratio": 0.0,
"completion_length": 509.3125,
"epoch": 1.7604166666666665,
"grad_norm": 1.0887670516967773,
"kl": 0.2236328125,
"learning_rate": 4.131944444444444e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.71875,
"step": 845
},
{
"clip_ratio": 0.0,
"completion_length": 519.4375,
"epoch": 1.7625,
"grad_norm": 1.3897308111190796,
"kl": 0.23974609375,
"learning_rate": 4.1249999999999997e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 846
},
{
"clip_ratio": 0.0,
"completion_length": 486.75,
"epoch": 1.7645833333333334,
"grad_norm": 0.009728828445076942,
"kl": 0.251953125,
"learning_rate": 4.118055555555555e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 847
},
{
"clip_ratio": 0.0,
"completion_length": 522.875,
"epoch": 1.7666666666666666,
"grad_norm": 0.00988440215587616,
"kl": 0.2412109375,
"learning_rate": 4.1111111111111107e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 848
},
{
"clip_ratio": 0.0,
"completion_length": 515.125,
"epoch": 1.76875,
"grad_norm": 0.9975308179855347,
"kl": 0.2294921875,
"learning_rate": 4.104166666666666e-07,
"loss": 0.0002,
"reward": 1.03125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.75,
"step": 849
},
{
"clip_ratio": 0.0,
"completion_length": 518.65625,
"epoch": 1.7708333333333335,
"grad_norm": 0.007184633985161781,
"kl": 0.22998046875,
"learning_rate": 4.0972222222222217e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 850
},
{
"clip_ratio": 0.0,
"completion_length": 519.5,
"epoch": 1.7729166666666667,
"grad_norm": 1.1164164543151855,
"kl": 0.232421875,
"learning_rate": 4.0902777777777777e-07,
"loss": 0.0002,
"reward": 1.875,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 1.0,
"step": 851
},
{
"clip_ratio": 0.0,
"completion_length": 507.84375,
"epoch": 1.775,
"grad_norm": 0.007584977429360151,
"kl": 0.233642578125,
"learning_rate": 4.083333333333333e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 852
},
{
"clip_ratio": 0.0,
"completion_length": 562.03125,
"epoch": 1.7770833333333333,
"grad_norm": 0.007155933883041143,
"kl": 0.22705078125,
"learning_rate": 4.076388888888889e-07,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 853
},
{
"clip_ratio": 0.0,
"completion_length": 523.09375,
"epoch": 1.7791666666666668,
"grad_norm": 0.007546697277575731,
"kl": 0.22900390625,
"learning_rate": 4.069444444444444e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 854
},
{
"clip_ratio": 0.0,
"completion_length": 505.09375,
"epoch": 1.78125,
"grad_norm": 1.150571584701538,
"kl": 0.22412109375,
"learning_rate": 4.0625e-07,
"loss": 0.0002,
"reward": 1.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 855
},
{
"clip_ratio": 0.0,
"completion_length": 555.40625,
"epoch": 1.7833333333333332,
"grad_norm": 1.047773838043213,
"kl": 0.239990234375,
"learning_rate": 4.055555555555555e-07,
"loss": 0.0002,
"reward": 1.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 856
},
{
"clip_ratio": 0.0,
"completion_length": 509.46875,
"epoch": 1.7854166666666667,
"grad_norm": 1.4127116203308105,
"kl": 0.240966796875,
"learning_rate": 4.048611111111111e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 1.0,
"step": 857
},
{
"clip_ratio": 0.0,
"completion_length": 543.9375,
"epoch": 1.7875,
"grad_norm": 0.8850436806678772,
"kl": 0.256103515625,
"learning_rate": 4.041666666666667e-07,
"loss": 0.0003,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 858
},
{
"clip_ratio": 0.0,
"completion_length": 499.8125,
"epoch": 1.7895833333333333,
"grad_norm": 0.007401830516755581,
"kl": 0.222900390625,
"learning_rate": 4.0347222222222223e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 859
},
{
"clip_ratio": 0.0,
"completion_length": 535.75,
"epoch": 1.7916666666666665,
"grad_norm": 1.490172266960144,
"kl": 0.234619140625,
"learning_rate": 4.027777777777778e-07,
"loss": 0.0002,
"reward": 1.03125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.75,
"step": 860
},
{
"clip_ratio": 0.0,
"completion_length": 548.375,
"epoch": 1.79375,
"grad_norm": 3.965214490890503,
"kl": 0.228271484375,
"learning_rate": 4.0208333333333333e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 861
},
{
"clip_ratio": 0.0,
"completion_length": 505.21875,
"epoch": 1.7958333333333334,
"grad_norm": 0.00795065052807331,
"kl": 0.24658203125,
"learning_rate": 4.013888888888889e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 862
},
{
"clip_ratio": 0.0,
"completion_length": 526.40625,
"epoch": 1.7979166666666666,
"grad_norm": 0.01374655682593584,
"kl": 0.212646484375,
"learning_rate": 4.0069444444444443e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 863
},
{
"clip_ratio": 0.0,
"completion_length": 533.15625,
"epoch": 1.8,
"grad_norm": 0.008126798085868359,
"kl": 0.227783203125,
"learning_rate": 4e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 864
},
{
"clip_ratio": 0.0,
"completion_length": 531.6875,
"epoch": 1.8020833333333335,
"grad_norm": 0.007166092284023762,
"kl": 0.231689453125,
"learning_rate": 3.993055555555556e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 865
},
{
"clip_ratio": 0.0,
"completion_length": 550.3125,
"epoch": 1.8041666666666667,
"grad_norm": 1.0180275440216064,
"kl": 0.22607421875,
"learning_rate": 3.9861111111111114e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 866
},
{
"clip_ratio": 0.0,
"completion_length": 545.71875,
"epoch": 1.80625,
"grad_norm": 0.012762402184307575,
"kl": 0.242431640625,
"learning_rate": 3.9791666666666663e-07,
"loss": 0.0002,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 867
},
{
"clip_ratio": 0.0,
"completion_length": 509.65625,
"epoch": 1.8083333333333333,
"grad_norm": 1.4062516689300537,
"kl": 0.232421875,
"learning_rate": 3.972222222222222e-07,
"loss": 0.0002,
"reward": 1.9375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 868
},
{
"clip_ratio": 0.0,
"completion_length": 519.0,
"epoch": 1.8104166666666668,
"grad_norm": 1.635831356048584,
"kl": 0.243896484375,
"learning_rate": 3.9652777777777773e-07,
"loss": 0.0002,
"reward": 1.9375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 869
},
{
"clip_ratio": 0.0,
"completion_length": 546.5625,
"epoch": 1.8125,
"grad_norm": 0.007981624454259872,
"kl": 0.21728515625,
"learning_rate": 3.958333333333333e-07,
"loss": 0.0002,
"reward": 1.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 870
},
{
"clip_ratio": 0.0,
"completion_length": 547.34375,
"epoch": 1.8145833333333332,
"grad_norm": 2.526054620742798,
"kl": 0.22802734375,
"learning_rate": 3.9513888888888883e-07,
"loss": 0.0002,
"reward": 1.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.75,
"step": 871
},
{
"clip_ratio": 0.0,
"completion_length": 548.3125,
"epoch": 1.8166666666666667,
"grad_norm": 1.5642163753509521,
"kl": 0.2314453125,
"learning_rate": 3.9444444444444444e-07,
"loss": 0.0002,
"reward": 1.4375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 0.75,
"step": 872
},
{
"clip_ratio": 0.0,
"completion_length": 528.625,
"epoch": 1.81875,
"grad_norm": 0.007630742155015469,
"kl": 0.227783203125,
"learning_rate": 3.9375e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 873
},
{
"clip_ratio": 0.0,
"completion_length": 561.75,
"epoch": 1.8208333333333333,
"grad_norm": 0.0069867093116045,
"kl": 0.230712890625,
"learning_rate": 3.9305555555555554e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 874
},
{
"clip_ratio": 0.0,
"completion_length": 547.40625,
"epoch": 1.8229166666666665,
"grad_norm": 0.02551759034395218,
"kl": 0.217041015625,
"learning_rate": 3.923611111111111e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 875
},
{
"clip_ratio": 0.0,
"completion_length": 514.4375,
"epoch": 1.825,
"grad_norm": 0.012089493684470654,
"kl": 0.23583984375,
"learning_rate": 3.9166666666666664e-07,
"loss": 0.0002,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 876
},
{
"clip_ratio": 0.0,
"completion_length": 526.3125,
"epoch": 1.8270833333333334,
"grad_norm": 0.03711254894733429,
"kl": 0.2236328125,
"learning_rate": 3.909722222222222e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 877
},
{
"clip_ratio": 0.0,
"completion_length": 501.5625,
"epoch": 1.8291666666666666,
"grad_norm": 0.008239555172622204,
"kl": 0.24169921875,
"learning_rate": 3.9027777777777774e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 878
},
{
"clip_ratio": 0.0,
"completion_length": 556.0,
"epoch": 1.83125,
"grad_norm": 0.007976679131388664,
"kl": 0.235107421875,
"learning_rate": 3.8958333333333334e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 879
},
{
"clip_ratio": 0.0,
"completion_length": 502.875,
"epoch": 1.8333333333333335,
"grad_norm": 0.009058798663318157,
"kl": 0.248046875,
"learning_rate": 3.888888888888889e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 880
},
{
"clip_ratio": 0.0,
"completion_length": 508.4375,
"epoch": 1.8354166666666667,
"grad_norm": 0.007584620267152786,
"kl": 0.22900390625,
"learning_rate": 3.8819444444444445e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 881
},
{
"clip_ratio": 0.0,
"completion_length": 544.65625,
"epoch": 1.8375,
"grad_norm": 0.007870173081755638,
"kl": 0.219970703125,
"learning_rate": 3.875e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 882
},
{
"clip_ratio": 0.0,
"completion_length": 520.96875,
"epoch": 1.8395833333333333,
"grad_norm": 1.4032890796661377,
"kl": 0.2431640625,
"learning_rate": 3.8680555555555555e-07,
"loss": 0.0002,
"reward": 1.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 883
},
{
"clip_ratio": 0.0,
"completion_length": 553.78125,
"epoch": 1.8416666666666668,
"grad_norm": 1.04135000705719,
"kl": 0.256103515625,
"learning_rate": 3.861111111111111e-07,
"loss": 0.0003,
"reward": 1.1875,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 884
},
{
"clip_ratio": 0.0,
"completion_length": 508.3125,
"epoch": 1.84375,
"grad_norm": 0.006909274961799383,
"kl": 0.230224609375,
"learning_rate": 3.8541666666666665e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 885
},
{
"clip_ratio": 0.0,
"completion_length": 550.96875,
"epoch": 1.8458333333333332,
"grad_norm": 0.006952146999537945,
"kl": 0.222900390625,
"learning_rate": 3.8472222222222225e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 886
},
{
"clip_ratio": 0.0,
"completion_length": 540.875,
"epoch": 1.8479166666666667,
"grad_norm": 1.0288861989974976,
"kl": 0.226318359375,
"learning_rate": 3.840277777777778e-07,
"loss": 0.0002,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 887
},
{
"clip_ratio": 0.0,
"completion_length": 531.28125,
"epoch": 1.85,
"grad_norm": 0.007326650433242321,
"kl": 0.228515625,
"learning_rate": 3.8333333333333335e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 888
},
{
"clip_ratio": 0.0,
"completion_length": 539.40625,
"epoch": 1.8520833333333333,
"grad_norm": 0.015359265729784966,
"kl": 0.226318359375,
"learning_rate": 3.8263888888888885e-07,
"loss": 0.0002,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 889
},
{
"clip_ratio": 0.0,
"completion_length": 511.40625,
"epoch": 1.8541666666666665,
"grad_norm": 5.617523193359375,
"kl": 0.23681640625,
"learning_rate": 3.819444444444444e-07,
"loss": 0.0002,
"reward": 1.875,
"reward_std": 0.2925042062997818,
"rewards/accuracy_reward": 0.875,
"rewards/format_reward": 1.0,
"step": 890
},
{
"clip_ratio": 0.0,
"completion_length": 551.21875,
"epoch": 1.85625,
"grad_norm": 1.0431323051452637,
"kl": 0.235595703125,
"learning_rate": 3.8124999999999995e-07,
"loss": 0.0002,
"reward": 0.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.5,
"step": 891
},
{
"clip_ratio": 0.0,
"completion_length": 514.46875,
"epoch": 1.8583333333333334,
"grad_norm": 1.8033677339553833,
"kl": 0.232421875,
"learning_rate": 3.805555555555555e-07,
"loss": 0.0002,
"reward": 1.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 1.0,
"step": 892
},
{
"clip_ratio": 0.0,
"completion_length": 508.65625,
"epoch": 1.8604166666666666,
"grad_norm": 0.006438506301492453,
"kl": 0.214111328125,
"learning_rate": 3.7986111111111105e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 893
},
{
"clip_ratio": 0.0,
"completion_length": 541.71875,
"epoch": 1.8625,
"grad_norm": 0.9842865467071533,
"kl": 0.224365234375,
"learning_rate": 3.7916666666666665e-07,
"loss": 0.0002,
"reward": 1.34375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.59375,
"rewards/format_reward": 0.75,
"step": 894
},
{
"clip_ratio": 0.0,
"completion_length": 551.65625,
"epoch": 1.8645833333333335,
"grad_norm": 0.007584480568766594,
"kl": 0.2353515625,
"learning_rate": 3.784722222222222e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 895
},
{
"clip_ratio": 0.0,
"completion_length": 571.71875,
"epoch": 1.8666666666666667,
"grad_norm": 0.8712323904037476,
"kl": 0.220703125,
"learning_rate": 3.7777777777777775e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 1.0,
"step": 896
},
{
"clip_ratio": 0.0,
"completion_length": 539.375,
"epoch": 1.86875,
"grad_norm": 0.014179886318743229,
"kl": 0.240234375,
"learning_rate": 3.770833333333333e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 897
},
{
"clip_ratio": 0.0,
"completion_length": 520.78125,
"epoch": 1.8708333333333333,
"grad_norm": 1.2494723796844482,
"kl": 0.2431640625,
"learning_rate": 3.7638888888888886e-07,
"loss": 0.0002,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.5,
"step": 898
},
{
"clip_ratio": 0.0,
"completion_length": 540.75,
"epoch": 1.8729166666666668,
"grad_norm": 0.007599582429975271,
"kl": 0.221923828125,
"learning_rate": 3.756944444444444e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 899
},
{
"clip_ratio": 0.0,
"completion_length": 531.375,
"epoch": 1.875,
"grad_norm": 1.021194338798523,
"kl": 0.239013671875,
"learning_rate": 3.75e-07,
"loss": 0.0002,
"reward": 1.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 900
},
{
"clip_ratio": 0.0,
"completion_length": 511.53125,
"epoch": 1.8770833333333332,
"grad_norm": 1.638185977935791,
"kl": 0.22607421875,
"learning_rate": 3.7430555555555556e-07,
"loss": 0.0002,
"reward": 1.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.75,
"step": 901
},
{
"clip_ratio": 0.0,
"completion_length": 482.1875,
"epoch": 1.8791666666666667,
"grad_norm": 1.0922802686691284,
"kl": 0.268310546875,
"learning_rate": 3.736111111111111e-07,
"loss": 0.0003,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.71875,
"step": 902
},
{
"clip_ratio": 0.0,
"completion_length": 500.375,
"epoch": 1.88125,
"grad_norm": 0.011066813953220844,
"kl": 0.23828125,
"learning_rate": 3.7291666666666666e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 903
},
{
"clip_ratio": 0.0,
"completion_length": 481.46875,
"epoch": 1.8833333333333333,
"grad_norm": 0.00842176005244255,
"kl": 0.2294921875,
"learning_rate": 3.722222222222222e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.75,
"step": 904
},
{
"clip_ratio": 0.0,
"completion_length": 509.0,
"epoch": 1.8854166666666665,
"grad_norm": 0.007877349853515625,
"kl": 0.237548828125,
"learning_rate": 3.7152777777777776e-07,
"loss": 0.0002,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 905
},
{
"clip_ratio": 0.0,
"completion_length": 537.78125,
"epoch": 1.8875,
"grad_norm": 1.380505084991455,
"kl": 0.22900390625,
"learning_rate": 3.708333333333333e-07,
"loss": 0.0002,
"reward": 0.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.25,
"step": 906
},
{
"clip_ratio": 0.0,
"completion_length": 486.90625,
"epoch": 1.8895833333333334,
"grad_norm": 0.9170504808425903,
"kl": 0.244873046875,
"learning_rate": 3.701388888888889e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 907
},
{
"clip_ratio": 0.0,
"completion_length": 494.3125,
"epoch": 1.8916666666666666,
"grad_norm": 0.008912342600524426,
"kl": 0.23681640625,
"learning_rate": 3.6944444444444447e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 908
},
{
"clip_ratio": 0.0,
"completion_length": 484.75,
"epoch": 1.89375,
"grad_norm": 1.8576105833053589,
"kl": 0.234375,
"learning_rate": 3.6875e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.75,
"step": 909
},
{
"clip_ratio": 0.0,
"completion_length": 487.9375,
"epoch": 1.8958333333333335,
"grad_norm": 0.006501065567135811,
"kl": 0.225830078125,
"learning_rate": 3.6805555555555557e-07,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 910
},
{
"clip_ratio": 0.0,
"completion_length": 509.4375,
"epoch": 1.8979166666666667,
"grad_norm": 1.175131916999817,
"kl": 0.23388671875,
"learning_rate": 3.673611111111111e-07,
"loss": 0.0002,
"reward": 0.53125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.5,
"step": 911
},
{
"clip_ratio": 0.0,
"completion_length": 508.4375,
"epoch": 1.9,
"grad_norm": 0.008509078063070774,
"kl": 0.252685546875,
"learning_rate": 3.666666666666666e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 912
},
{
"clip_ratio": 0.0,
"completion_length": 476.5625,
"epoch": 1.9020833333333333,
"grad_norm": 1.2172304391860962,
"kl": 0.23876953125,
"learning_rate": 3.6597222222222217e-07,
"loss": 0.0002,
"reward": 1.15625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.75,
"step": 913
},
{
"clip_ratio": 0.0,
"completion_length": 485.21875,
"epoch": 1.9041666666666668,
"grad_norm": 0.006972001399844885,
"kl": 0.22998046875,
"learning_rate": 3.652777777777777e-07,
"loss": 0.0002,
"reward": 1.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 914
},
{
"clip_ratio": 0.0,
"completion_length": 492.84375,
"epoch": 1.90625,
"grad_norm": 1.4839038848876953,
"kl": 0.235595703125,
"learning_rate": 3.645833333333333e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 915
},
{
"clip_ratio": 0.0,
"completion_length": 509.9375,
"epoch": 1.9083333333333332,
"grad_norm": 0.011867961846292019,
"kl": 0.24609375,
"learning_rate": 3.6388888888888887e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 916
},
{
"clip_ratio": 0.0,
"completion_length": 492.71875,
"epoch": 1.9104166666666667,
"grad_norm": 1.2053366899490356,
"kl": 0.2373046875,
"learning_rate": 3.631944444444444e-07,
"loss": 0.0002,
"reward": 1.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 1.0,
"step": 917
},
{
"clip_ratio": 0.0,
"completion_length": 477.03125,
"epoch": 1.9125,
"grad_norm": 0.0455983504652977,
"kl": 0.222412109375,
"learning_rate": 3.6249999999999997e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 918
},
{
"clip_ratio": 0.0,
"completion_length": 495.4375,
"epoch": 1.9145833333333333,
"grad_norm": 1.0417066812515259,
"kl": 0.245361328125,
"learning_rate": 3.618055555555555e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 919
},
{
"clip_ratio": 0.0,
"completion_length": 487.75,
"epoch": 1.9166666666666665,
"grad_norm": 0.0077906264923512936,
"kl": 0.23583984375,
"learning_rate": 3.6111111111111107e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 920
},
{
"clip_ratio": 0.0,
"completion_length": 469.28125,
"epoch": 1.91875,
"grad_norm": 0.00771137373521924,
"kl": 0.238525390625,
"learning_rate": 3.604166666666666e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 921
},
{
"clip_ratio": 0.0,
"completion_length": 432.125,
"epoch": 1.9208333333333334,
"grad_norm": 2.136435031890869,
"kl": 0.256103515625,
"learning_rate": 3.597222222222222e-07,
"loss": 0.0003,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.75,
"step": 922
},
{
"clip_ratio": 0.0,
"completion_length": 452.0,
"epoch": 1.9229166666666666,
"grad_norm": 0.006540779490023851,
"kl": 0.22509765625,
"learning_rate": 3.590277777777778e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 923
},
{
"clip_ratio": 0.0,
"completion_length": 451.8125,
"epoch": 1.925,
"grad_norm": 1.0841056108474731,
"kl": 0.244873046875,
"learning_rate": 3.583333333333333e-07,
"loss": 0.0002,
"reward": 1.1875,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.71875,
"step": 924
},
{
"clip_ratio": 0.0,
"completion_length": 460.5,
"epoch": 1.9270833333333335,
"grad_norm": 0.008191017434000969,
"kl": 0.240478515625,
"learning_rate": 3.576388888888889e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 925
},
{
"clip_ratio": 0.0,
"completion_length": 465.875,
"epoch": 1.9291666666666667,
"grad_norm": 0.007460338994860649,
"kl": 0.2529296875,
"learning_rate": 3.5694444444444443e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 926
},
{
"clip_ratio": 0.0,
"completion_length": 465.4375,
"epoch": 1.93125,
"grad_norm": 0.014311104081571102,
"kl": 0.2529296875,
"learning_rate": 3.5625e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 927
},
{
"clip_ratio": 0.0,
"completion_length": 445.125,
"epoch": 1.9333333333333333,
"grad_norm": 1.2025405168533325,
"kl": 0.250244140625,
"learning_rate": 3.5555555555555553e-07,
"loss": 0.0002,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 928
},
{
"clip_ratio": 0.0,
"completion_length": 428.5,
"epoch": 1.9354166666666668,
"grad_norm": 0.007025923114269972,
"kl": 0.23974609375,
"learning_rate": 3.5486111111111113e-07,
"loss": 0.0002,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 929
},
{
"clip_ratio": 0.0,
"completion_length": 449.0625,
"epoch": 1.9375,
"grad_norm": 0.008093161508440971,
"kl": 0.2373046875,
"learning_rate": 3.541666666666667e-07,
"loss": 0.0002,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 930
},
{
"clip_ratio": 0.0,
"completion_length": 440.9375,
"epoch": 1.9395833333333332,
"grad_norm": 0.013698437251150608,
"kl": 0.2470703125,
"learning_rate": 3.5347222222222223e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 931
},
{
"clip_ratio": 0.0,
"completion_length": 478.96875,
"epoch": 1.9416666666666667,
"grad_norm": 0.00795261561870575,
"kl": 0.240966796875,
"learning_rate": 3.527777777777778e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 1.0,
"step": 932
},
{
"clip_ratio": 0.0,
"completion_length": 430.5625,
"epoch": 1.94375,
"grad_norm": 0.008631990291178226,
"kl": 0.239990234375,
"learning_rate": 3.5208333333333333e-07,
"loss": 0.0002,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 933
},
{
"clip_ratio": 0.0,
"completion_length": 481.09375,
"epoch": 1.9458333333333333,
"grad_norm": 0.008530229330062866,
"kl": 0.252197265625,
"learning_rate": 3.5138888888888883e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 934
},
{
"clip_ratio": 0.0,
"completion_length": 447.09375,
"epoch": 1.9479166666666665,
"grad_norm": 0.010614525526762009,
"kl": 0.254150390625,
"learning_rate": 3.506944444444444e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 935
},
{
"clip_ratio": 0.0,
"completion_length": 437.78125,
"epoch": 1.95,
"grad_norm": 1.1804996728897095,
"kl": 0.243408203125,
"learning_rate": 3.5e-07,
"loss": 0.0002,
"reward": 1.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 936
},
{
"clip_ratio": 0.0,
"completion_length": 435.0625,
"epoch": 1.9520833333333334,
"grad_norm": 1.2165803909301758,
"kl": 0.244384765625,
"learning_rate": 3.4930555555555553e-07,
"loss": 0.0002,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.5,
"step": 937
},
{
"clip_ratio": 0.0,
"completion_length": 465.53125,
"epoch": 1.9541666666666666,
"grad_norm": 1.3811777830123901,
"kl": 0.258056640625,
"learning_rate": 3.486111111111111e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 1.0,
"step": 938
},
{
"clip_ratio": 0.0,
"completion_length": 463.96875,
"epoch": 1.95625,
"grad_norm": 0.007179384585469961,
"kl": 0.237060546875,
"learning_rate": 3.4791666666666664e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 939
},
{
"clip_ratio": 0.0,
"completion_length": 416.0625,
"epoch": 1.9583333333333335,
"grad_norm": 1.3711605072021484,
"kl": 0.3095703125,
"learning_rate": 3.472222222222222e-07,
"loss": 0.0003,
"reward": 1.09375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.75,
"step": 940
},
{
"clip_ratio": 0.0,
"completion_length": 441.84375,
"epoch": 1.9604166666666667,
"grad_norm": 0.016370367258787155,
"kl": 0.2802734375,
"learning_rate": 3.4652777777777774e-07,
"loss": 0.0003,
"reward": 1.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 941
},
{
"clip_ratio": 0.0,
"completion_length": 427.09375,
"epoch": 1.9625,
"grad_norm": 1.2782434225082397,
"kl": 0.275390625,
"learning_rate": 3.458333333333333e-07,
"loss": 0.0003,
"reward": 1.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 1.0,
"step": 942
},
{
"clip_ratio": 0.0,
"completion_length": 445.6875,
"epoch": 1.9645833333333333,
"grad_norm": 1.1922998428344727,
"kl": 0.248046875,
"learning_rate": 3.451388888888889e-07,
"loss": 0.0002,
"reward": 1.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 943
},
{
"clip_ratio": 0.0,
"completion_length": 475.03125,
"epoch": 1.9666666666666668,
"grad_norm": 1.0037637948989868,
"kl": 0.243896484375,
"learning_rate": 3.4444444444444444e-07,
"loss": 0.0002,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.5,
"step": 944
},
{
"clip_ratio": 0.0,
"completion_length": 426.03125,
"epoch": 1.96875,
"grad_norm": 0.013457462191581726,
"kl": 0.25634765625,
"learning_rate": 3.4375e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 945
},
{
"clip_ratio": 0.0,
"completion_length": 448.9375,
"epoch": 1.9708333333333332,
"grad_norm": 0.008211650885641575,
"kl": 0.254150390625,
"learning_rate": 3.4305555555555554e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 946
},
{
"clip_ratio": 0.0,
"completion_length": 440.15625,
"epoch": 1.9729166666666667,
"grad_norm": 1.1255629062652588,
"kl": 0.257568359375,
"learning_rate": 3.423611111111111e-07,
"loss": 0.0003,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 947
},
{
"clip_ratio": 0.0,
"completion_length": 451.625,
"epoch": 1.975,
"grad_norm": 1.566752552986145,
"kl": 0.25927734375,
"learning_rate": 3.4166666666666664e-07,
"loss": 0.0003,
"reward": 0.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.5,
"step": 948
},
{
"clip_ratio": 0.0,
"completion_length": 425.75,
"epoch": 1.9770833333333333,
"grad_norm": 1.967722773551941,
"kl": 0.250244140625,
"learning_rate": 3.409722222222222e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.75,
"step": 949
},
{
"clip_ratio": 0.0,
"completion_length": 433.15625,
"epoch": 1.9791666666666665,
"grad_norm": 1.687232255935669,
"kl": 0.24951171875,
"learning_rate": 3.402777777777778e-07,
"loss": 0.0002,
"reward": 1.0625,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 1.0,
"step": 950
},
{
"clip_ratio": 0.0,
"completion_length": 454.75,
"epoch": 1.98125,
"grad_norm": 0.010934803634881973,
"kl": 0.253662109375,
"learning_rate": 3.3958333333333335e-07,
"loss": 0.0003,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 951
},
{
"clip_ratio": 0.0,
"completion_length": 437.3125,
"epoch": 1.9833333333333334,
"grad_norm": 0.014278494752943516,
"kl": 0.259521484375,
"learning_rate": 3.388888888888889e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 952
},
{
"clip_ratio": 0.0,
"completion_length": 471.875,
"epoch": 1.9854166666666666,
"grad_norm": 0.008121107704937458,
"kl": 0.25244140625,
"learning_rate": 3.3819444444444445e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 953
},
{
"clip_ratio": 0.0,
"completion_length": 456.09375,
"epoch": 1.9875,
"grad_norm": 0.00813852995634079,
"kl": 0.253173828125,
"learning_rate": 3.375e-07,
"loss": 0.0003,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.25,
"step": 954
},
{
"clip_ratio": 0.0,
"completion_length": 442.3125,
"epoch": 1.9895833333333335,
"grad_norm": 0.009365586563944817,
"kl": 0.260498046875,
"learning_rate": 3.3680555555555555e-07,
"loss": 0.0003,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 955
},
{
"clip_ratio": 0.0,
"completion_length": 437.5625,
"epoch": 1.9916666666666667,
"grad_norm": 1.130090594291687,
"kl": 0.24072265625,
"learning_rate": 3.361111111111111e-07,
"loss": 0.0002,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.5,
"step": 956
},
{
"clip_ratio": 0.0,
"completion_length": 447.9375,
"epoch": 1.99375,
"grad_norm": 0.0085580600425601,
"kl": 0.256103515625,
"learning_rate": 3.3541666666666665e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 957
},
{
"clip_ratio": 0.0,
"completion_length": 456.75,
"epoch": 1.9958333333333333,
"grad_norm": 0.010669535025954247,
"kl": 0.244140625,
"learning_rate": 3.347222222222222e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 958
},
{
"clip_ratio": 0.0,
"completion_length": 449.84375,
"epoch": 1.9979166666666668,
"grad_norm": 0.01216217689216137,
"kl": 0.26416015625,
"learning_rate": 3.3402777777777775e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 959
},
{
"clip_ratio": 0.0,
"completion_length": 456.8125,
"epoch": 2.0,
"grad_norm": 0.007194915786385536,
"kl": 0.24365234375,
"learning_rate": 3.333333333333333e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 960
},
{
"clip_ratio": 0.0,
"completion_length": 459.1875,
"epoch": 2.002083333333333,
"grad_norm": 0.007638991344720125,
"kl": 0.243408203125,
"learning_rate": 3.3263888888888885e-07,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 961
},
{
"clip_ratio": 0.0,
"completion_length": 439.03125,
"epoch": 2.004166666666667,
"grad_norm": 0.00782975647598505,
"kl": 0.250732421875,
"learning_rate": 3.319444444444444e-07,
"loss": 0.0003,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 962
},
{
"clip_ratio": 0.0,
"completion_length": 411.875,
"epoch": 2.00625,
"grad_norm": 3.1766531467437744,
"kl": 0.244384765625,
"learning_rate": 3.3124999999999995e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 963
},
{
"clip_ratio": 0.0,
"completion_length": 448.84375,
"epoch": 2.0083333333333333,
"grad_norm": 1.5000786781311035,
"kl": 0.253173828125,
"learning_rate": 3.3055555555555556e-07,
"loss": 0.0003,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.75,
"step": 964
},
{
"clip_ratio": 0.0,
"completion_length": 472.75,
"epoch": 2.0104166666666665,
"grad_norm": 1.7209105491638184,
"kl": 0.2392578125,
"learning_rate": 3.298611111111111e-07,
"loss": 0.0002,
"reward": 1.75,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 965
},
{
"clip_ratio": 0.0,
"completion_length": 450.90625,
"epoch": 2.0125,
"grad_norm": 1.6055102348327637,
"kl": 0.24072265625,
"learning_rate": 3.2916666666666666e-07,
"loss": 0.0002,
"reward": 1.6875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.6875,
"rewards/format_reward": 1.0,
"step": 966
},
{
"clip_ratio": 0.0,
"completion_length": 457.4375,
"epoch": 2.0145833333333334,
"grad_norm": 1.1209073066711426,
"kl": 0.248291015625,
"learning_rate": 3.284722222222222e-07,
"loss": 0.0002,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 967
},
{
"clip_ratio": 0.0,
"completion_length": 444.40625,
"epoch": 2.0166666666666666,
"grad_norm": 0.009695399552583694,
"kl": 0.26416015625,
"learning_rate": 3.2777777777777776e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 968
},
{
"clip_ratio": 0.0,
"completion_length": 451.6875,
"epoch": 2.01875,
"grad_norm": 1.2274049520492554,
"kl": 0.2587890625,
"learning_rate": 3.270833333333333e-07,
"loss": 0.0003,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 969
},
{
"clip_ratio": 0.0,
"completion_length": 445.4375,
"epoch": 2.0208333333333335,
"grad_norm": 0.008248819038271904,
"kl": 0.244140625,
"learning_rate": 3.2638888888888886e-07,
"loss": 0.0002,
"reward": 1.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 970
},
{
"clip_ratio": 0.0,
"completion_length": 446.28125,
"epoch": 2.0229166666666667,
"grad_norm": 1.0125038623809814,
"kl": 0.23828125,
"learning_rate": 3.2569444444444446e-07,
"loss": 0.0002,
"reward": 1.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 0.75,
"step": 971
},
{
"clip_ratio": 0.0,
"completion_length": 438.46875,
"epoch": 2.025,
"grad_norm": 0.007286733016371727,
"kl": 0.2431640625,
"learning_rate": 3.25e-07,
"loss": 0.0002,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 972
},
{
"clip_ratio": 0.0,
"completion_length": 478.0625,
"epoch": 2.027083333333333,
"grad_norm": 0.018277425318956375,
"kl": 0.2314453125,
"learning_rate": 3.2430555555555556e-07,
"loss": 0.0002,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 973
},
{
"clip_ratio": 0.0,
"completion_length": 456.9375,
"epoch": 2.029166666666667,
"grad_norm": 0.007195204496383667,
"kl": 0.24560546875,
"learning_rate": 3.236111111111111e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 974
},
{
"clip_ratio": 0.0,
"completion_length": 419.4375,
"epoch": 2.03125,
"grad_norm": 0.008570291101932526,
"kl": 0.2607421875,
"learning_rate": 3.2291666666666666e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 975
},
{
"clip_ratio": 0.0,
"completion_length": 447.1875,
"epoch": 2.033333333333333,
"grad_norm": 0.007945443503558636,
"kl": 0.244873046875,
"learning_rate": 3.222222222222222e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 976
},
{
"clip_ratio": 0.0,
"completion_length": 451.53125,
"epoch": 2.035416666666667,
"grad_norm": 1.0743132829666138,
"kl": 0.26318359375,
"learning_rate": 3.2152777777777776e-07,
"loss": 0.0003,
"reward": 0.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.25,
"step": 977
},
{
"clip_ratio": 0.0,
"completion_length": 468.875,
"epoch": 2.0375,
"grad_norm": 1.5062912702560425,
"kl": 0.2529296875,
"learning_rate": 3.2083333333333337e-07,
"loss": 0.0003,
"reward": 1.75,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 978
},
{
"clip_ratio": 0.0,
"completion_length": 460.0,
"epoch": 2.0395833333333333,
"grad_norm": 0.0329461507499218,
"kl": 0.2919921875,
"learning_rate": 3.2013888888888886e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 1.0,
"step": 979
},
{
"clip_ratio": 0.0,
"completion_length": 437.625,
"epoch": 2.0416666666666665,
"grad_norm": 1.2301101684570312,
"kl": 0.24560546875,
"learning_rate": 3.194444444444444e-07,
"loss": 0.0002,
"reward": 1.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.96875,
"rewards/format_reward": 1.0,
"step": 980
},
{
"clip_ratio": 0.0,
"completion_length": 465.21875,
"epoch": 2.04375,
"grad_norm": 1.5222910642623901,
"kl": 0.246826171875,
"learning_rate": 3.1874999999999997e-07,
"loss": 0.0002,
"reward": 1.71875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.71875,
"rewards/format_reward": 1.0,
"step": 981
},
{
"clip_ratio": 0.0,
"completion_length": 459.4375,
"epoch": 2.0458333333333334,
"grad_norm": 0.010759882628917694,
"kl": 0.23876953125,
"learning_rate": 3.180555555555555e-07,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 982
},
{
"clip_ratio": 0.0,
"completion_length": 419.53125,
"epoch": 2.0479166666666666,
"grad_norm": 0.007903813384473324,
"kl": 0.251953125,
"learning_rate": 3.1736111111111107e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 983
},
{
"clip_ratio": 0.0,
"completion_length": 444.15625,
"epoch": 2.05,
"grad_norm": 1.4439796209335327,
"kl": 0.262451171875,
"learning_rate": 3.166666666666666e-07,
"loss": 0.0003,
"reward": 1.9375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.9375,
"rewards/format_reward": 1.0,
"step": 984
},
{
"clip_ratio": 0.0,
"completion_length": 433.90625,
"epoch": 2.0520833333333335,
"grad_norm": 0.029047423973679543,
"kl": 0.266845703125,
"learning_rate": 3.159722222222222e-07,
"loss": 0.0003,
"reward": 1.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 985
},
{
"clip_ratio": 0.0,
"completion_length": 470.84375,
"epoch": 2.0541666666666667,
"grad_norm": 1.1269396543502808,
"kl": 0.2607421875,
"learning_rate": 3.1527777777777777e-07,
"loss": 0.0003,
"reward": 1.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.75,
"step": 986
},
{
"clip_ratio": 0.0,
"completion_length": 471.25,
"epoch": 2.05625,
"grad_norm": 0.007986017502844334,
"kl": 0.2509765625,
"learning_rate": 3.145833333333333e-07,
"loss": 0.0003,
"reward": 2.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 1.0,
"step": 987
},
{
"clip_ratio": 0.0,
"completion_length": 449.75,
"epoch": 2.058333333333333,
"grad_norm": 0.007769202347844839,
"kl": 0.2490234375,
"learning_rate": 3.1388888888888887e-07,
"loss": 0.0002,
"reward": 1.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 988
},
{
"clip_ratio": 0.0,
"completion_length": 466.09375,
"epoch": 2.060416666666667,
"grad_norm": 1.3015766143798828,
"kl": 0.24951171875,
"learning_rate": 3.131944444444444e-07,
"loss": 0.0002,
"reward": 0.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.25,
"step": 989
},
{
"clip_ratio": 0.0,
"completion_length": 476.4375,
"epoch": 2.0625,
"grad_norm": 1.3159650564193726,
"kl": 0.251953125,
"learning_rate": 3.1249999999999997e-07,
"loss": 0.0003,
"reward": 1.1875,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.75,
"step": 990
},
{
"clip_ratio": 0.0,
"completion_length": 463.0,
"epoch": 2.064583333333333,
"grad_norm": 0.010242442600429058,
"kl": 0.256591796875,
"learning_rate": 3.118055555555555e-07,
"loss": 0.0003,
"reward": 1.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 1.0,
"step": 991
},
{
"clip_ratio": 0.0,
"completion_length": 462.53125,
"epoch": 2.066666666666667,
"grad_norm": 1.5279037952423096,
"kl": 0.26611328125,
"learning_rate": 3.111111111111111e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.75,
"step": 992
},
{
"clip_ratio": 0.0,
"completion_length": 461.21875,
"epoch": 2.06875,
"grad_norm": 0.01802394911646843,
"kl": 0.258544921875,
"learning_rate": 3.104166666666667e-07,
"loss": 0.0003,
"reward": 1.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 993
},
{
"clip_ratio": 0.0,
"completion_length": 458.53125,
"epoch": 2.0708333333333333,
"grad_norm": 1.1756128072738647,
"kl": 0.2548828125,
"learning_rate": 3.0972222222222223e-07,
"loss": 0.0003,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 994
},
{
"clip_ratio": 0.0,
"completion_length": 462.03125,
"epoch": 2.0729166666666665,
"grad_norm": 1.3839462995529175,
"kl": 0.251220703125,
"learning_rate": 3.090277777777778e-07,
"loss": 0.0003,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.5,
"step": 995
},
{
"clip_ratio": 0.0,
"completion_length": 451.9375,
"epoch": 2.075,
"grad_norm": 1.179376482963562,
"kl": 0.243408203125,
"learning_rate": 3.0833333333333333e-07,
"loss": 0.0002,
"reward": 1.21875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.46875,
"rewards/format_reward": 0.75,
"step": 996
},
{
"clip_ratio": 0.0,
"completion_length": 423.6875,
"epoch": 2.0770833333333334,
"grad_norm": 0.011729571036994457,
"kl": 0.235107421875,
"learning_rate": 3.076388888888889e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 997
},
{
"clip_ratio": 0.0,
"completion_length": 460.0,
"epoch": 2.0791666666666666,
"grad_norm": 0.007601437624543905,
"kl": 0.240234375,
"learning_rate": 3.0694444444444443e-07,
"loss": 0.0002,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.75,
"step": 998
},
{
"clip_ratio": 0.0,
"completion_length": 452.53125,
"epoch": 2.08125,
"grad_norm": 0.007883809506893158,
"kl": 0.255859375,
"learning_rate": 3.0625000000000003e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 999
},
{
"clip_ratio": 0.0,
"completion_length": 464.34375,
"epoch": 2.0833333333333335,
"grad_norm": 0.01050996221601963,
"kl": 0.250244140625,
"learning_rate": 3.055555555555556e-07,
"loss": 0.0003,
"reward": 1.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.75,
"step": 1000
}
],
"logging_steps": 1.0,
"max_steps": 1440,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}