Qwen2.5-1.5B-Open-R1-GRPO / trainer_state.json
cameronphchen's picture
Model save
9cc072c verified
raw
history blame
190 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 200.0,
"eval_steps": 10,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 459.375,
"epoch": 0.5,
"grad_norm": 2.876323361721518,
"kl": 0.0,
"learning_rate": 5.000000000000001e-07,
"loss": 0.3004,
"reward": 0.65625,
"reward_std": 0.8805903792381287,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.28125,
"rewards/format_reward_staging": 0.375,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 472.78125,
"epoch": 1.0,
"grad_norm": 2.1678805572948407,
"kl": 0.0,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.2723,
"reward": 0.859375,
"reward_std": 0.8921410292387009,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.34375,
"rewards/format_reward_staging": 0.515625,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 402.53125,
"epoch": 1.5,
"grad_norm": 2.685905075565771,
"kl": 0.000606536865234375,
"learning_rate": 1.5e-06,
"loss": 0.2138,
"reward": 0.890625,
"reward_std": 0.9797716289758682,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.421875,
"rewards/format_reward_staging": 0.46875,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 435.40625,
"epoch": 2.0,
"grad_norm": 2.4665649457110215,
"kl": 0.0006070137023925781,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.2588,
"reward": 0.84375,
"reward_std": 1.4599270820617676,
"rewards/accuracy_reward_staging": 0.015625,
"rewards/format_reward": 0.296875,
"rewards/format_reward_staging": 0.390625,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 441.5,
"epoch": 2.5,
"grad_norm": 2.7610545552229273,
"kl": 0.0015697479248046875,
"learning_rate": 2.5e-06,
"loss": 0.3144,
"reward": 0.875,
"reward_std": 0.8861797749996185,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.375,
"rewards/format_reward_staging": 0.5,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 538.15625,
"epoch": 3.0,
"grad_norm": 1.9363663555232191,
"kl": 0.006195068359375,
"learning_rate": 3e-06,
"loss": 0.208,
"reward": 0.734375,
"reward_std": 0.9395850598812103,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.328125,
"rewards/format_reward_staging": 0.40625,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 438.53125,
"epoch": 3.5,
"grad_norm": 40.059369045816226,
"kl": 0.00463104248046875,
"learning_rate": 3.5e-06,
"loss": 0.2105,
"reward": 0.75,
"reward_std": 0.8607002794742584,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.28125,
"rewards/format_reward_staging": 0.46875,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 444.6875,
"epoch": 4.0,
"grad_norm": 2.042344469155283,
"kl": 0.0260009765625,
"learning_rate": 4.000000000000001e-06,
"loss": 0.2712,
"reward": 0.96875,
"reward_std": 0.9265350848436356,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.390625,
"rewards/format_reward_staging": 0.578125,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 436.828125,
"epoch": 4.5,
"grad_norm": 1.8209399338820504,
"kl": 0.029571533203125,
"learning_rate": 4.5e-06,
"loss": 0.3164,
"reward": 1.0,
"reward_std": 0.9064923822879791,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.421875,
"rewards/format_reward_staging": 0.578125,
"step": 9
},
{
"epoch": 5.0,
"grad_norm": 2.6748764898489186,
"learning_rate": 5e-06,
"loss": 0.3694,
"step": 10
},
{
"epoch": 5.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 323.546875,
"eval_kl": 0.1407470703125,
"eval_loss": 0.21327295899391174,
"eval_reward": 1.625,
"eval_reward_std": 0.6777683570981026,
"eval_rewards/accuracy_reward_staging": 0.0,
"eval_rewards/format_reward": 0.78125,
"eval_rewards/format_reward_staging": 0.84375,
"eval_runtime": 40.7929,
"eval_samples_per_second": 0.196,
"eval_steps_per_second": 0.025,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 387.734375,
"epoch": 5.5,
"grad_norm": 3.5720394192584792,
"kl": 0.1397705078125,
"learning_rate": 5.500000000000001e-06,
"loss": 0.4228,
"reward": 1.421875,
"reward_std": 1.1599705293774605,
"rewards/accuracy_reward_staging": 0.0078125,
"rewards/format_reward": 0.6484375,
"rewards/format_reward_staging": 0.6953125,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 331.90625,
"epoch": 6.0,
"grad_norm": 3.665359231516559,
"kl": 0.20849609375,
"learning_rate": 6e-06,
"loss": 0.5381,
"reward": 1.640625,
"reward_std": 0.5743362456560135,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.90625,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 303.671875,
"epoch": 6.5,
"grad_norm": 2.5383946033246154,
"kl": 0.239501953125,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.2558,
"reward": 1.921875,
"reward_std": 1.0853875279426575,
"rewards/accuracy_reward_staging": 0.015625,
"rewards/format_reward": 0.859375,
"rewards/format_reward_staging": 0.90625,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 253.375,
"epoch": 7.0,
"grad_norm": 2.9224354012117546,
"kl": 0.36865234375,
"learning_rate": 7e-06,
"loss": 0.1188,
"reward": 1.90625,
"reward_std": 0.3234764039516449,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 229.6875,
"epoch": 7.5,
"grad_norm": 134.13549246823817,
"kl": 2.4775390625,
"learning_rate": 7.500000000000001e-06,
"loss": 0.25,
"reward": 1.90625,
"reward_std": 0.29578250646591187,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 201.375,
"epoch": 8.0,
"grad_norm": 3.387910323346065,
"kl": 0.7109375,
"learning_rate": 8.000000000000001e-06,
"loss": 0.0706,
"reward": 1.9375,
"reward_std": 0.11180339753627777,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 244.984375,
"epoch": 8.5,
"grad_norm": 4.308168061347317,
"kl": 0.810546875,
"learning_rate": 8.5e-06,
"loss": 0.0776,
"reward": 1.984375,
"reward_std": 0.0625,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 1.0,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 211.59375,
"epoch": 9.0,
"grad_norm": 2.9369254269642333,
"kl": 1.34228515625,
"learning_rate": 9e-06,
"loss": -0.0485,
"reward": 1.921875,
"reward_std": 0.11967839300632477,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.984375,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 313.828125,
"epoch": 9.5,
"grad_norm": 1.4789470328059777,
"kl": 0.5146484375,
"learning_rate": 9.5e-06,
"loss": 0.0547,
"reward": 1.96875,
"reward_std": 0.08539125323295593,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.984375,
"step": 19
},
{
"epoch": 10.0,
"grad_norm": 80.10476715951108,
"learning_rate": 1e-05,
"loss": 0.0398,
"step": 20
},
{
"epoch": 10.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 278.5546875,
"eval_kl": 0.609130859375,
"eval_loss": -0.0038001190405339003,
"eval_reward": 2.1015625,
"eval_reward_std": 0.5469204634428024,
"eval_rewards/accuracy_reward_staging": 0.015625,
"eval_rewards/format_reward": 0.953125,
"eval_rewards/format_reward_staging": 0.9921875,
"eval_runtime": 30.1338,
"eval_samples_per_second": 0.265,
"eval_steps_per_second": 0.033,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 268.6953125,
"epoch": 10.5,
"grad_norm": 2.0467738114975678,
"kl": 4.900390625,
"learning_rate": 1.0500000000000001e-05,
"loss": 0.0361,
"reward": 2.25,
"reward_std": 1.1723129898309708,
"rewards/accuracy_reward_staging": 0.03125,
"rewards/format_reward": 0.9453125,
"rewards/format_reward_staging": 0.9921875,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 304.1875,
"epoch": 11.0,
"grad_norm": 1.8964954924900732,
"kl": 0.7880859375,
"learning_rate": 1.1000000000000001e-05,
"loss": -0.1894,
"reward": 2.078125,
"reward_std": 0.8426409065723419,
"rewards/accuracy_reward_staging": 0.015625,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 1.0,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 266.6875,
"epoch": 11.5,
"grad_norm": 1.6929978014913423,
"kl": 0.779296875,
"learning_rate": 1.15e-05,
"loss": -0.0221,
"reward": 1.90625,
"reward_std": 0.24866947531700134,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 1.0,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 259.125,
"epoch": 12.0,
"grad_norm": 1.7383655787319439,
"kl": 0.7666015625,
"learning_rate": 1.2e-05,
"loss": 0.0302,
"reward": 3.0625,
"reward_std": 3.1296846866607666,
"rewards/accuracy_reward_staging": 0.109375,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 233.625,
"epoch": 12.5,
"grad_norm": 4.648191964945557,
"kl": 1.6943359375,
"learning_rate": 1.25e-05,
"loss": -0.0409,
"reward": 2.21875,
"reward_std": 1.4246117174625397,
"rewards/accuracy_reward_staging": 0.03125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 239.625,
"epoch": 13.0,
"grad_norm": 1.4292059587187729,
"kl": 0.57421875,
"learning_rate": 1.3000000000000001e-05,
"loss": -0.0077,
"reward": 2.734375,
"reward_std": 1.3342310190200806,
"rewards/accuracy_reward_staging": 0.078125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 223.390625,
"epoch": 13.5,
"grad_norm": 2.513864581190583,
"kl": 1.228515625,
"learning_rate": 1.3500000000000001e-05,
"loss": 0.0735,
"reward": 2.921875,
"reward_std": 2.031271666288376,
"rewards/accuracy_reward_staging": 0.109375,
"rewards/format_reward": 0.859375,
"rewards/format_reward_staging": 0.96875,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 221.65625,
"epoch": 14.0,
"grad_norm": 1.5115163324009253,
"kl": 0.724609375,
"learning_rate": 1.4e-05,
"loss": 0.0464,
"reward": 2.078125,
"reward_std": 0.6708659529685974,
"rewards/accuracy_reward_staging": 0.015625,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.984375,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 269.03125,
"epoch": 14.5,
"grad_norm": 1.7257409221598474,
"kl": 0.5712890625,
"learning_rate": 1.45e-05,
"loss": -0.0904,
"reward": 3.34375,
"reward_std": 2.8355378210544586,
"rewards/accuracy_reward_staging": 0.140625,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 29
},
{
"epoch": 15.0,
"grad_norm": 2.5117601129661438,
"learning_rate": 1.5000000000000002e-05,
"loss": -0.0204,
"step": 30
},
{
"epoch": 15.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 292.3984375,
"eval_kl": 0.724365234375,
"eval_loss": 0.037196554243564606,
"eval_reward": 3.015625,
"eval_reward_std": 2.8644309490919113,
"eval_rewards/accuracy_reward_staging": 0.1171875,
"eval_rewards/format_reward": 0.8984375,
"eval_rewards/format_reward_staging": 0.9453125,
"eval_runtime": 35.4144,
"eval_samples_per_second": 0.226,
"eval_steps_per_second": 0.028,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 258.734375,
"epoch": 15.5,
"grad_norm": 2.0989197265306423,
"kl": 0.6796875,
"learning_rate": 1.55e-05,
"loss": -0.1012,
"reward": 2.5390625,
"reward_std": 1.8101423382759094,
"rewards/accuracy_reward_staging": 0.0625,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.9921875,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 298.40625,
"epoch": 16.0,
"grad_norm": 1.1476664560432441,
"kl": 0.611328125,
"learning_rate": 1.6000000000000003e-05,
"loss": -0.0672,
"reward": 2.28125,
"reward_std": 0.93930384516716,
"rewards/accuracy_reward_staging": 0.03125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 268.296875,
"epoch": 16.5,
"grad_norm": 19.21670679033428,
"kl": 1.50927734375,
"learning_rate": 1.65e-05,
"loss": 0.0674,
"reward": 2.296875,
"reward_std": 1.257249653339386,
"rewards/accuracy_reward_staging": 0.03125,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 1.0,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 282.3125,
"epoch": 17.0,
"grad_norm": 1.4246525591844976,
"kl": 0.56787109375,
"learning_rate": 1.7e-05,
"loss": -0.0194,
"reward": 3.8125,
"reward_std": 3.774241268634796,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.984375,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 319.765625,
"epoch": 17.5,
"grad_norm": 1.9762168820196657,
"kl": 0.77197265625,
"learning_rate": 1.7500000000000002e-05,
"loss": 0.2802,
"reward": 3.0,
"reward_std": 2.2321222722530365,
"rewards/accuracy_reward_staging": 0.109375,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.953125,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 315.3125,
"epoch": 18.0,
"grad_norm": 1.342401716752108,
"kl": 0.7529296875,
"learning_rate": 1.8e-05,
"loss": 0.0864,
"reward": 3.796875,
"reward_std": 2.384265750646591,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.984375,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 180.671875,
"epoch": 18.5,
"grad_norm": 1.3852885333924034,
"kl": 0.7998046875,
"learning_rate": 1.8500000000000002e-05,
"loss": -0.0281,
"reward": 3.375,
"reward_std": 2.220172733068466,
"rewards/accuracy_reward_staging": 0.140625,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 0.96875,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 153.625,
"epoch": 19.0,
"grad_norm": 1.5314695103803033,
"kl": 0.89453125,
"learning_rate": 1.9e-05,
"loss": -0.001,
"reward": 2.921875,
"reward_std": 1.9802924394607544,
"rewards/accuracy_reward_staging": 0.09375,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 1.0,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 187.234375,
"epoch": 19.5,
"grad_norm": 1.5329342983674943,
"kl": 0.796875,
"learning_rate": 1.95e-05,
"loss": 0.0871,
"reward": 2.40625,
"reward_std": 1.1022064685821533,
"rewards/accuracy_reward_staging": 0.046875,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.953125,
"step": 39
},
{
"epoch": 20.0,
"grad_norm": 1.80824958208745,
"learning_rate": 2e-05,
"loss": 0.0547,
"step": 40
},
{
"epoch": 20.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 199.2421875,
"eval_kl": 0.71337890625,
"eval_loss": 0.02822229452431202,
"eval_reward": 3.6640625,
"eval_reward_std": 2.5834601297974586,
"eval_rewards/accuracy_reward_staging": 0.1953125,
"eval_rewards/format_reward": 0.96875,
"eval_rewards/format_reward_staging": 0.7421875,
"eval_runtime": 28.7471,
"eval_samples_per_second": 0.278,
"eval_steps_per_second": 0.035,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 253.6796875,
"epoch": 20.5,
"grad_norm": 2.494613965798382,
"kl": 0.66796875,
"learning_rate": 1.9999619230641714e-05,
"loss": -0.083,
"reward": 4.09375,
"reward_std": 2.9600732252001762,
"rewards/accuracy_reward_staging": 0.2265625,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.859375,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 231.5625,
"epoch": 21.0,
"grad_norm": 1.5230693811732243,
"kl": 1.0322265625,
"learning_rate": 1.9998476951563914e-05,
"loss": 0.0046,
"reward": 3.453125,
"reward_std": 3.205719515681267,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.6875,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 278.203125,
"epoch": 21.5,
"grad_norm": 1.758870754503535,
"kl": 0.6474609375,
"learning_rate": 1.9996573249755573e-05,
"loss": -0.0531,
"reward": 3.65625,
"reward_std": 2.5534728318452835,
"rewards/accuracy_reward_staging": 0.203125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.71875,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 275.15625,
"epoch": 22.0,
"grad_norm": 2.0919325244919147,
"kl": 0.9189453125,
"learning_rate": 1.999390827019096e-05,
"loss": 0.0602,
"reward": 3.75,
"reward_std": 2.127801224589348,
"rewards/accuracy_reward_staging": 0.203125,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.828125,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 328.328125,
"epoch": 22.5,
"grad_norm": 1.5048949701874923,
"kl": 0.705078125,
"learning_rate": 1.999048221581858e-05,
"loss": -0.0272,
"reward": 4.140625,
"reward_std": 2.4673196375370026,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.859375,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 430.84375,
"epoch": 23.0,
"grad_norm": 5.30560128522589,
"kl": 0.68798828125,
"learning_rate": 1.9986295347545738e-05,
"loss": 0.1258,
"reward": 4.0,
"reward_std": 1.9574655294418335,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.890625,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 447.90625,
"epoch": 23.5,
"grad_norm": 1.5599503211998336,
"kl": 0.6640625,
"learning_rate": 1.998134798421867e-05,
"loss": 0.0862,
"reward": 3.53125,
"reward_std": 2.2529123574495316,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.796875,
"rewards/format_reward_staging": 0.859375,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 517.34375,
"epoch": 24.0,
"grad_norm": 1.2228825438248228,
"kl": 0.6796875,
"learning_rate": 1.9975640502598243e-05,
"loss": 0.0315,
"reward": 2.96875,
"reward_std": 2.65271133184433,
"rewards/accuracy_reward_staging": 0.171875,
"rewards/format_reward": 0.453125,
"rewards/format_reward_staging": 0.796875,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 598.65625,
"epoch": 24.5,
"grad_norm": 1.146901459334409,
"kl": 0.6064453125,
"learning_rate": 1.9969173337331283e-05,
"loss": 0.1387,
"reward": 1.859375,
"reward_std": 1.7595358788967133,
"rewards/accuracy_reward_staging": 0.0625,
"rewards/format_reward": 0.453125,
"rewards/format_reward_staging": 0.78125,
"step": 49
},
{
"epoch": 25.0,
"grad_norm": 1.375237075929609,
"learning_rate": 1.9961946980917457e-05,
"loss": 0.156,
"step": 50
},
{
"epoch": 25.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 575.0234375,
"eval_kl": 1.018798828125,
"eval_loss": 0.19486893713474274,
"eval_reward": 2.0234375,
"eval_reward_std": 2.9213491678237915,
"eval_rewards/accuracy_reward_staging": 0.1015625,
"eval_rewards/format_reward": 0.3671875,
"eval_rewards/format_reward_staging": 0.640625,
"eval_runtime": 59.4705,
"eval_samples_per_second": 0.135,
"eval_steps_per_second": 0.017,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 562.515625,
"epoch": 25.5,
"grad_norm": 188.4500979533979,
"kl": 2.618408203125,
"learning_rate": 1.9953961983671792e-05,
"loss": 0.5177,
"reward": 2.5,
"reward_std": 3.7192839682102203,
"rewards/accuracy_reward_staging": 0.1484375,
"rewards/format_reward": 0.3671875,
"rewards/format_reward_staging": 0.6484375,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 571.78125,
"epoch": 26.0,
"grad_norm": 7356.139198346828,
"kl": 219.451171875,
"learning_rate": 1.9945218953682736e-05,
"loss": 12.1361,
"reward": 1.1875,
"reward_std": 2.103448197245598,
"rewards/accuracy_reward_staging": 0.046875,
"rewards/format_reward": 0.265625,
"rewards/format_reward_staging": 0.453125,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 631.9375,
"epoch": 26.5,
"grad_norm": 58.63460971785782,
"kl": 2.384765625,
"learning_rate": 1.9935718556765878e-05,
"loss": 0.3151,
"reward": 0.625,
"reward_std": 1.280954971909523,
"rewards/accuracy_reward_staging": 0.015625,
"rewards/format_reward": 0.203125,
"rewards/format_reward_staging": 0.265625,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 685.90625,
"epoch": 27.0,
"grad_norm": 6.628332912526643,
"kl": 1.361328125,
"learning_rate": 1.9925461516413224e-05,
"loss": 0.3519,
"reward": 0.515625,
"reward_std": 1.2405942231416702,
"rewards/accuracy_reward_staging": 0.015625,
"rewards/format_reward": 0.15625,
"rewards/format_reward_staging": 0.203125,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 358.859375,
"epoch": 27.5,
"grad_norm": 3.8474726393370386,
"kl": 3.169921875,
"learning_rate": 1.9914448613738107e-05,
"loss": 0.339,
"reward": 0.78125,
"reward_std": 1.5188637673854828,
"rewards/accuracy_reward_staging": 0.03125,
"rewards/format_reward": 0.203125,
"rewards/format_reward_staging": 0.265625,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 160.28125,
"epoch": 28.0,
"grad_norm": 1711.6146146132392,
"kl": 29.546875,
"learning_rate": 1.9902680687415704e-05,
"loss": 2.1619,
"reward": 1.15625,
"reward_std": 2.088463395833969,
"rewards/accuracy_reward_staging": 0.0625,
"rewards/format_reward": 0.21875,
"rewards/format_reward_staging": 0.3125,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 204.78125,
"epoch": 28.5,
"grad_norm": 172.76666342508884,
"kl": 7.39453125,
"learning_rate": 1.989015863361917e-05,
"loss": 0.6565,
"reward": 1.3125,
"reward_std": 2.61825592815876,
"rewards/accuracy_reward_staging": 0.078125,
"rewards/format_reward": 0.25,
"rewards/format_reward_staging": 0.28125,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 114.875,
"epoch": 29.0,
"grad_norm": 14.841168261507285,
"kl": 4.638671875,
"learning_rate": 1.9876883405951378e-05,
"loss": -0.094,
"reward": 0.78125,
"reward_std": 1.340459167957306,
"rewards/accuracy_reward_staging": 0.03125,
"rewards/format_reward": 0.234375,
"rewards/format_reward_staging": 0.234375,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 159.546875,
"epoch": 29.5,
"grad_norm": 38.76716232869443,
"kl": 3.5234375,
"learning_rate": 1.9862856015372315e-05,
"loss": 0.0135,
"reward": 1.125,
"reward_std": 2.4588640481233597,
"rewards/accuracy_reward_staging": 0.0625,
"rewards/format_reward": 0.28125,
"rewards/format_reward_staging": 0.21875,
"step": 59
},
{
"epoch": 30.0,
"grad_norm": 7.631987331877285,
"learning_rate": 1.9848077530122083e-05,
"loss": 0.0661,
"step": 60
},
{
"epoch": 30.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 180.6640625,
"eval_kl": 40.0986328125,
"eval_loss": 4.4661993980407715,
"eval_reward": 2.296875,
"eval_reward_std": 2.8048948869109154,
"eval_rewards/accuracy_reward_staging": 0.125,
"eval_rewards/format_reward": 0.5,
"eval_rewards/format_reward_staging": 0.546875,
"eval_runtime": 26.742,
"eval_samples_per_second": 0.299,
"eval_steps_per_second": 0.037,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 175.5703125,
"epoch": 30.5,
"grad_norm": 10.455818966130773,
"kl": 2.056640625,
"learning_rate": 1.983254907563955e-05,
"loss": 0.058,
"reward": 2.5390625,
"reward_std": 3.450487032532692,
"rewards/accuracy_reward_staging": 0.15625,
"rewards/format_reward": 0.4765625,
"rewards/format_reward_staging": 0.5,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 196.15625,
"epoch": 31.0,
"grad_norm": 40355.16362771545,
"kl": 549.29296875,
"learning_rate": 1.9816271834476642e-05,
"loss": 46.2265,
"reward": 1.53125,
"reward_std": 1.8515103608369827,
"rewards/accuracy_reward_staging": 0.046875,
"rewards/format_reward": 0.46875,
"rewards/format_reward_staging": 0.59375,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 136.71875,
"epoch": 31.5,
"grad_norm": 13611.623138416926,
"kl": 673.318359375,
"learning_rate": 1.9799247046208297e-05,
"loss": 49.8755,
"reward": 2.5,
"reward_std": 3.039003312587738,
"rewards/accuracy_reward_staging": 0.140625,
"rewards/format_reward": 0.515625,
"rewards/format_reward_staging": 0.578125,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 177.1875,
"epoch": 32.0,
"grad_norm": 30.436611541482517,
"kl": 1.78515625,
"learning_rate": 1.9781476007338058e-05,
"loss": 0.2471,
"reward": 4.484375,
"reward_std": 3.2696904987096786,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.703125,
"rewards/format_reward_staging": 0.65625,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 146.578125,
"epoch": 32.5,
"grad_norm": 53.62802825228914,
"kl": 1.798828125,
"learning_rate": 1.9762960071199334e-05,
"loss": 0.1721,
"reward": 3.953125,
"reward_std": 3.039620280265808,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.71875,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 140.65625,
"epoch": 33.0,
"grad_norm": 25.346908056620485,
"kl": 2.13671875,
"learning_rate": 1.9743700647852356e-05,
"loss": 0.1572,
"reward": 3.59375,
"reward_std": 2.9897230714559555,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.71875,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 118.59375,
"epoch": 33.5,
"grad_norm": 27.454135887691468,
"kl": 2.25390625,
"learning_rate": 1.9723699203976768e-05,
"loss": -0.0063,
"reward": 1.875,
"reward_std": 1.3994054794311523,
"rewards/accuracy_reward_staging": 0.03125,
"rewards/format_reward": 0.765625,
"rewards/format_reward_staging": 0.796875,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 94.875,
"epoch": 34.0,
"grad_norm": 2.334869737314596,
"kl": 1.462890625,
"learning_rate": 1.9702957262759964e-05,
"loss": 0.0725,
"reward": 5.890625,
"reward_std": 3.7823618352413177,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.890625,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 100.140625,
"epoch": 34.5,
"grad_norm": 15.721464309872673,
"kl": 3.056640625,
"learning_rate": 1.968147640378108e-05,
"loss": -0.0322,
"reward": 5.96875,
"reward_std": 3.234290450811386,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.8125,
"step": 69
},
{
"epoch": 35.0,
"grad_norm": 57428.88495542333,
"learning_rate": 1.9659258262890683e-05,
"loss": 54.082,
"step": 70
},
{
"epoch": 35.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 96.375,
"eval_kl": 2.8583984375,
"eval_loss": 0.3439752459526062,
"eval_reward": 3.6953125,
"eval_reward_std": 2.673917531967163,
"eval_rewards/accuracy_reward_staging": 0.2109375,
"eval_rewards/format_reward": 0.7890625,
"eval_rewards/format_reward_staging": 0.796875,
"eval_runtime": 18.4421,
"eval_samples_per_second": 0.434,
"eval_steps_per_second": 0.054,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 107.3515625,
"epoch": 35.5,
"grad_norm": 6.629087724584887,
"kl": 291.8291015625,
"learning_rate": 1.963630453208623e-05,
"loss": 0.0924,
"reward": 3.15625,
"reward_std": 2.308130495250225,
"rewards/accuracy_reward_staging": 0.15625,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.78125,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 84.40625,
"epoch": 36.0,
"grad_norm": 7.593042467965846,
"kl": 3.369140625,
"learning_rate": 1.961261695938319e-05,
"loss": -0.1438,
"reward": 3.828125,
"reward_std": 2.8605447858572006,
"rewards/accuracy_reward_staging": 0.265625,
"rewards/format_reward": 0.609375,
"rewards/format_reward_staging": 0.5625,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 91.109375,
"epoch": 36.5,
"grad_norm": 15.85973287182394,
"kl": 2.373046875,
"learning_rate": 1.958819734868193e-05,
"loss": 0.1167,
"reward": 5.375,
"reward_std": 3.426166355609894,
"rewards/accuracy_reward_staging": 0.359375,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.890625,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 82.4375,
"epoch": 37.0,
"grad_norm": 2.8730940159336003,
"kl": 1.78125,
"learning_rate": 1.9563047559630356e-05,
"loss": -0.0405,
"reward": 3.078125,
"reward_std": 1.6442697197198868,
"rewards/accuracy_reward_staging": 0.125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.859375,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 110.8125,
"epoch": 37.5,
"grad_norm": 2.084130828938957,
"kl": 1.671875,
"learning_rate": 1.953716950748227e-05,
"loss": 0.0182,
"reward": 4.046875,
"reward_std": 2.878495067358017,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.8125,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 153.84375,
"epoch": 38.0,
"grad_norm": 3.91268521038018,
"kl": 1.5625,
"learning_rate": 1.9510565162951538e-05,
"loss": 0.2027,
"reward": 3.46875,
"reward_std": 2.793519899249077,
"rewards/accuracy_reward_staging": 0.171875,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.875,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 131.390625,
"epoch": 38.5,
"grad_norm": 2.196193406623696,
"kl": 1.4453125,
"learning_rate": 1.9483236552061996e-05,
"loss": 0.1111,
"reward": 2.78125,
"reward_std": 2.516305774450302,
"rewards/accuracy_reward_staging": 0.109375,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.84375,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 144.90625,
"epoch": 39.0,
"grad_norm": 1.8430147295312553,
"kl": 1.384765625,
"learning_rate": 1.945518575599317e-05,
"loss": 0.112,
"reward": 3.765625,
"reward_std": 2.567844718694687,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.96875,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 113.875,
"epoch": 39.5,
"grad_norm": 2.2632400272333797,
"kl": 1.80859375,
"learning_rate": 1.9426414910921785e-05,
"loss": 0.0539,
"reward": 2.671875,
"reward_std": 1.4072720408439636,
"rewards/accuracy_reward_staging": 0.078125,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.9375,
"step": 79
},
{
"epoch": 40.0,
"grad_norm": 13.05828881312249,
"learning_rate": 1.9396926207859085e-05,
"loss": 0.2618,
"step": 80
},
{
"epoch": 40.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 107.0625,
"eval_kl": 1.9775390625,
"eval_loss": 0.1502879410982132,
"eval_reward": 3.3671875,
"eval_reward_std": 2.961985230445862,
"eval_rewards/accuracy_reward_staging": 0.1875,
"eval_rewards/format_reward": 0.734375,
"eval_rewards/format_reward_staging": 0.7578125,
"eval_runtime": 17.3809,
"eval_samples_per_second": 0.46,
"eval_steps_per_second": 0.058,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 142.5,
"epoch": 40.5,
"grad_norm": 3.1754965235193864,
"kl": 1.7373046875,
"learning_rate": 1.9366721892483976e-05,
"loss": 0.1444,
"reward": 3.125,
"reward_std": 2.8966881707310677,
"rewards/accuracy_reward_staging": 0.171875,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.71875,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 86.5625,
"epoch": 41.0,
"grad_norm": 3.7587088555535186,
"kl": 1.984375,
"learning_rate": 1.9335804264972018e-05,
"loss": 0.2337,
"reward": 3.6875,
"reward_std": 3.980203613638878,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.625,
"rewards/format_reward_staging": 0.71875,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 135.828125,
"epoch": 41.5,
"grad_norm": 3.975006138810092,
"kl": 2.05859375,
"learning_rate": 1.9304175679820247e-05,
"loss": 0.2953,
"reward": 3.09375,
"reward_std": 3.4910158962011337,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.5625,
"rewards/format_reward_staging": 0.65625,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 82.1875,
"epoch": 42.0,
"grad_norm": 18.245540108535405,
"kl": 1.978515625,
"learning_rate": 1.9271838545667876e-05,
"loss": 0.1649,
"reward": 4.015625,
"reward_std": 2.856592908501625,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.90625,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 124.890625,
"epoch": 42.5,
"grad_norm": 24.523972833745187,
"kl": 8.154296875,
"learning_rate": 1.9238795325112867e-05,
"loss": 0.2386,
"reward": 3.65625,
"reward_std": 3.7115366458892822,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.75,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 96.125,
"epoch": 43.0,
"grad_norm": 6.021647719248748,
"kl": 1.91796875,
"learning_rate": 1.9205048534524405e-05,
"loss": 0.0759,
"reward": 2.75,
"reward_std": 1.662882000207901,
"rewards/accuracy_reward_staging": 0.125,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.78125,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 135.796875,
"epoch": 43.5,
"grad_norm": 11.816137935249499,
"kl": 4.1640625,
"learning_rate": 1.917060074385124e-05,
"loss": 0.223,
"reward": 1.375,
"reward_std": 1.6268048882484436,
"rewards/accuracy_reward_staging": 0.03125,
"rewards/format_reward": 0.484375,
"rewards/format_reward_staging": 0.578125,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 68.59375,
"epoch": 44.0,
"grad_norm": 14.46698321940086,
"kl": 1.978515625,
"learning_rate": 1.913545457642601e-05,
"loss": 0.0088,
"reward": 5.953125,
"reward_std": 2.1696823835372925,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.953125,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 68.234375,
"epoch": 44.5,
"grad_norm": 63.367821121151934,
"kl": 57.087890625,
"learning_rate": 1.9099612708765432e-05,
"loss": 0.3075,
"reward": 4.0625,
"reward_std": 0.23853857815265656,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.78125,
"step": 89
},
{
"epoch": 45.0,
"grad_norm": 11.140992168821672,
"learning_rate": 1.9063077870366504e-05,
"loss": 0.0056,
"step": 90
},
{
"epoch": 45.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 53.8046875,
"eval_kl": 3.3603515625,
"eval_loss": -0.141469344496727,
"eval_reward": 3.75,
"eval_reward_std": 0.6776039004325867,
"eval_rewards/accuracy_reward_staging": 0.2265625,
"eval_rewards/format_reward": 0.7265625,
"eval_rewards/format_reward_staging": 0.7578125,
"eval_runtime": 8.6597,
"eval_samples_per_second": 0.924,
"eval_steps_per_second": 0.115,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 59.875,
"epoch": 45.5,
"grad_norm": 17.729188427124154,
"kl": 3.9931640625,
"learning_rate": 1.902585284349861e-05,
"loss": -0.0965,
"reward": 3.8828125,
"reward_std": 0.6357803642749786,
"rewards/accuracy_reward_staging": 0.2421875,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.7421875,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 51.75,
"epoch": 46.0,
"grad_norm": 6.567024017689014,
"kl": 2.939453125,
"learning_rate": 1.8987940462991673e-05,
"loss": 0.0987,
"reward": 3.796875,
"reward_std": 0.7728912532329559,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.703125,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 34.703125,
"epoch": 46.5,
"grad_norm": 2.7843419900288473,
"kl": 4.166015625,
"learning_rate": 1.894934361602025e-05,
"loss": 0.1104,
"reward": 3.5,
"reward_std": 0.0,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.5,
"rewards/format_reward_staging": 0.5,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 63.5,
"epoch": 47.0,
"grad_norm": 2.842446909471968,
"kl": 1.50390625,
"learning_rate": 1.891006524188368e-05,
"loss": 0.0614,
"reward": 4.5,
"reward_std": 0.0,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 48.609375,
"epoch": 47.5,
"grad_norm": 16.935662051415687,
"kl": 3.41796875,
"learning_rate": 1.887010833178222e-05,
"loss": 0.0746,
"reward": 3.96875,
"reward_std": 0.08539125323295593,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.75,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 48.40625,
"epoch": 48.0,
"grad_norm": 65.56995634930182,
"kl": 8.68359375,
"learning_rate": 1.8829475928589272e-05,
"loss": 0.2729,
"reward": 2.96875,
"reward_std": 1.9943940043449402,
"rewards/accuracy_reward_staging": 0.15625,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.71875,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 53.921875,
"epoch": 48.5,
"grad_norm": 7.023283731196324,
"kl": 3.861328125,
"learning_rate": 1.8788171126619653e-05,
"loss": 0.0774,
"reward": 1.375,
"reward_std": 0.26598526537418365,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.6875,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 53.40625,
"epoch": 49.0,
"grad_norm": 5.600387532938351,
"kl": 4.03515625,
"learning_rate": 1.874619707139396e-05,
"loss": 0.1163,
"reward": 1.84375,
"reward_std": 1.9289895445108414,
"rewards/accuracy_reward_staging": 0.0625,
"rewards/format_reward": 0.609375,
"rewards/format_reward_staging": 0.609375,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 51.5625,
"epoch": 49.5,
"grad_norm": 546.222996523794,
"kl": 10.48046875,
"learning_rate": 1.8703556959398998e-05,
"loss": 0.2164,
"reward": 1.96875,
"reward_std": 1.679331585764885,
"rewards/accuracy_reward_staging": 0.078125,
"rewards/format_reward": 0.59375,
"rewards/format_reward_staging": 0.59375,
"step": 99
},
{
"epoch": 50.0,
"grad_norm": 6.92675967993841,
"learning_rate": 1.866025403784439e-05,
"loss": 0.0302,
"step": 100
},
{
"epoch": 50.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 53.1953125,
"eval_kl": 3.412109375,
"eval_loss": -0.0034832179080694914,
"eval_reward": 1.84375,
"eval_reward_std": 1.4304483458399773,
"eval_rewards/accuracy_reward_staging": 0.0546875,
"eval_rewards/format_reward": 0.6484375,
"eval_rewards/format_reward_staging": 0.6484375,
"eval_runtime": 8.882,
"eval_samples_per_second": 0.901,
"eval_steps_per_second": 0.113,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 58.1484375,
"epoch": 50.5,
"grad_norm": 42.08311553908255,
"kl": 4.25390625,
"learning_rate": 1.861629160441526e-05,
"loss": 0.1744,
"reward": 1.890625,
"reward_std": 1.2476187869906425,
"rewards/accuracy_reward_staging": 0.0546875,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.671875,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 57.875,
"epoch": 51.0,
"grad_norm": 14.483914466544725,
"kl": 3.54296875,
"learning_rate": 1.8571673007021124e-05,
"loss": 0.0479,
"reward": 2.515625,
"reward_std": 1.502477765083313,
"rewards/accuracy_reward_staging": 0.109375,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.703125,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 77.078125,
"epoch": 51.5,
"grad_norm": 3.9386280468332933,
"kl": 1.173828125,
"learning_rate": 1.8526401643540924e-05,
"loss": 0.0927,
"reward": 3.34375,
"reward_std": 1.4447221755981445,
"rewards/accuracy_reward_staging": 0.140625,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.984375,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 42.65625,
"epoch": 52.0,
"grad_norm": 5.22687443422562,
"kl": 3.640625,
"learning_rate": 1.848048096156426e-05,
"loss": 0.0985,
"reward": 3.1875,
"reward_std": 0.8539125919342041,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.5,
"rewards/format_reward_staging": 0.5,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 53.59375,
"epoch": 52.5,
"grad_norm": 4.578066958828574,
"kl": 2.548828125,
"learning_rate": 1.843391445812886e-05,
"loss": 0.0016,
"reward": 3.46875,
"reward_std": 2.002212718129158,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.625,
"rewards/format_reward_staging": 0.65625,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 57.5625,
"epoch": 53.0,
"grad_norm": 24.480999374163673,
"kl": 2.4599609375,
"learning_rate": 1.8386705679454243e-05,
"loss": -0.3046,
"reward": 3.96875,
"reward_std": 0.25,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.75,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 39.734375,
"epoch": 53.5,
"grad_norm": 9.034066251198404,
"kl": 3.109375,
"learning_rate": 1.8338858220671683e-05,
"loss": -0.1409,
"reward": 0.9375,
"reward_std": 0.29578250646591187,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.46875,
"rewards/format_reward_staging": 0.46875,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 69.5,
"epoch": 54.0,
"grad_norm": 2.4189570966395286,
"kl": 1.1376953125,
"learning_rate": 1.8290375725550417e-05,
"loss": -0.05,
"reward": 6.734375,
"reward_std": 2.646019369363785,
"rewards/accuracy_reward_staging": 0.484375,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.96875,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 56.0,
"epoch": 54.5,
"grad_norm": 5.916855107370096,
"kl": 2.0869140625,
"learning_rate": 1.8241261886220155e-05,
"loss": 0.0611,
"reward": 4.578125,
"reward_std": 1.265925258398056,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.734375,
"step": 109
},
{
"epoch": 55.0,
"grad_norm": 10.466101735080972,
"learning_rate": 1.819152044288992e-05,
"loss": 0.0845,
"step": 110
},
{
"epoch": 55.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 54.3359375,
"eval_kl": 1.70068359375,
"eval_loss": -0.013815220445394516,
"eval_reward": 3.875,
"eval_reward_std": 1.7073951363563538,
"eval_rewards/accuracy_reward_staging": 0.25,
"eval_rewards/format_reward": 0.6875,
"eval_rewards/format_reward_staging": 0.6875,
"eval_runtime": 8.9181,
"eval_samples_per_second": 0.897,
"eval_steps_per_second": 0.112,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 62.453125,
"epoch": 55.5,
"grad_norm": 5.151983159259695,
"kl": 2.58154296875,
"learning_rate": 1.8141155183563195e-05,
"loss": -0.0532,
"reward": 3.125,
"reward_std": 1.16278538107872,
"rewards/accuracy_reward_staging": 0.1484375,
"rewards/format_reward": 0.8203125,
"rewards/format_reward_staging": 0.8203125,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 40.09375,
"epoch": 56.0,
"grad_norm": 3.725576654068748,
"kl": 2.65234375,
"learning_rate": 1.8090169943749477e-05,
"loss": 0.0112,
"reward": 5.234375,
"reward_std": 1.9529178738594055,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.40625,
"rewards/format_reward_staging": 0.453125,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 68.09375,
"epoch": 56.5,
"grad_norm": 3.421963670561032,
"kl": 1.21875,
"learning_rate": 1.8038568606172172e-05,
"loss": -0.0952,
"reward": 4.09375,
"reward_std": 1.8953916430473328,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.828125,
"rewards/format_reward_staging": 0.921875,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 38.9375,
"epoch": 57.0,
"grad_norm": 19.54047147726119,
"kl": 3.12890625,
"learning_rate": 1.798635510047293e-05,
"loss": 0.0866,
"reward": 3.828125,
"reward_std": 1.6875420212745667,
"rewards/accuracy_reward_staging": 0.296875,
"rewards/format_reward": 0.359375,
"rewards/format_reward_staging": 0.5,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 69.234375,
"epoch": 57.5,
"grad_norm": 11.266561852713883,
"kl": 2.2763671875,
"learning_rate": 1.7933533402912354e-05,
"loss": 0.0311,
"reward": 4.9375,
"reward_std": 1.8707758784294128,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.96875,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 36.625,
"epoch": 58.0,
"grad_norm": 2.4520418407831195,
"kl": 3.1875,
"learning_rate": 1.788010753606722e-05,
"loss": 0.0328,
"reward": 3.09375,
"reward_std": 1.0208056271076202,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.421875,
"rewards/format_reward_staging": 0.484375,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 65.109375,
"epoch": 58.5,
"grad_norm": 3.085523482612536,
"kl": 1.400390625,
"learning_rate": 1.782608156852414e-05,
"loss": -0.1165,
"reward": 3.90625,
"reward_std": 2.0603334307670593,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.796875,
"rewards/format_reward_staging": 0.921875,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 29.5625,
"epoch": 59.0,
"grad_norm": 10.594589302032892,
"kl": 2.95703125,
"learning_rate": 1.777145961456971e-05,
"loss": 0.0282,
"reward": 3.1875,
"reward_std": 1.6452402472496033,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.375,
"rewards/format_reward_staging": 0.46875,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 37.84375,
"epoch": 59.5,
"grad_norm": 4.104628944341923,
"kl": 2.490234375,
"learning_rate": 1.7716245833877202e-05,
"loss": 0.0598,
"reward": 1.109375,
"reward_std": 0.6517204642295837,
"rewards/accuracy_reward_staging": 0.015625,
"rewards/format_reward": 0.46875,
"rewards/format_reward_staging": 0.484375,
"step": 119
},
{
"epoch": 60.0,
"grad_norm": 2.7051939689212157,
"learning_rate": 1.766044443118978e-05,
"loss": -0.0924,
"step": 120
},
{
"epoch": 60.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 49.0859375,
"eval_kl": 2.880859375,
"eval_loss": -0.00882991123944521,
"eval_reward": 4.03125,
"eval_reward_std": 1.9904271215200424,
"eval_rewards/accuracy_reward_staging": 0.265625,
"eval_rewards/format_reward": 0.6640625,
"eval_rewards/format_reward_staging": 0.7109375,
"eval_runtime": 7.683,
"eval_samples_per_second": 1.041,
"eval_steps_per_second": 0.13,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 54.3359375,
"epoch": 60.5,
"grad_norm": 8.217105476776826,
"kl": 1.54443359375,
"learning_rate": 1.7604059656000313e-05,
"loss": 0.0161,
"reward": 5.5,
"reward_std": 1.8617027252912521,
"rewards/accuracy_reward_staging": 0.390625,
"rewards/format_reward": 0.7421875,
"rewards/format_reward_staging": 0.8515625,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 48.21875,
"epoch": 61.0,
"grad_norm": 8.76714381411277,
"kl": 2.35546875,
"learning_rate": 1.7547095802227723e-05,
"loss": 0.0302,
"reward": 2.8125,
"reward_std": 1.4936581254005432,
"rewards/accuracy_reward_staging": 0.140625,
"rewards/format_reward": 0.703125,
"rewards/format_reward_staging": 0.703125,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 35.328125,
"epoch": 61.5,
"grad_norm": 28.267166519601638,
"kl": 6.83203125,
"learning_rate": 1.7489557207890025e-05,
"loss": 0.0964,
"reward": 3.3125,
"reward_std": 0.75,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.484375,
"rewards/format_reward_staging": 0.484375,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 60.15625,
"epoch": 62.0,
"grad_norm": 6.154728910682446,
"kl": 2.5,
"learning_rate": 1.7431448254773943e-05,
"loss": -0.0487,
"reward": 3.421875,
"reward_std": 2.588469222187996,
"rewards/accuracy_reward_staging": 0.15625,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.9375,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 50.28125,
"epoch": 62.5,
"grad_norm": 6.026022088929383,
"kl": 2.130859375,
"learning_rate": 1.737277336810124e-05,
"loss": 0.0602,
"reward": 2.40625,
"reward_std": 2.038558602333069,
"rewards/accuracy_reward_staging": 0.09375,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.734375,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 50.28125,
"epoch": 63.0,
"grad_norm": 1.6439981557613697,
"kl": 1.34765625,
"learning_rate": 1.7313537016191706e-05,
"loss": 0.0648,
"reward": 3.0625,
"reward_std": 1.25,
"rewards/accuracy_reward_staging": 0.15625,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.75,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 63.234375,
"epoch": 63.5,
"grad_norm": 3.1039400606839846,
"kl": 1.615234375,
"learning_rate": 1.7253743710122877e-05,
"loss": -0.0558,
"reward": 4.203125,
"reward_std": 2.4739063382148743,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.921875,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 38.40625,
"epoch": 64.0,
"grad_norm": 3.9232383422378714,
"kl": 2.087890625,
"learning_rate": 1.7193398003386514e-05,
"loss": 0.0734,
"reward": 1.140625,
"reward_std": 0.6875,
"rewards/accuracy_reward_staging": 0.015625,
"rewards/format_reward": 0.484375,
"rewards/format_reward_staging": 0.5,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 72.671875,
"epoch": 64.5,
"grad_norm": 3.018724764911343,
"kl": 1.263671875,
"learning_rate": 1.713250449154182e-05,
"loss": 0.0133,
"reward": 4.75,
"reward_std": 2.8038886189460754,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.984375,
"step": 129
},
{
"epoch": 65.0,
"grad_norm": 3.0081574444150787,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.0217,
"step": 130
},
{
"epoch": 65.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 71.6171875,
"eval_kl": 1.7802734375,
"eval_loss": 0.047457288950681686,
"eval_reward": 3.8203125,
"eval_reward_std": 1.6125783324241638,
"eval_rewards/accuracy_reward_staging": 0.234375,
"eval_rewards/format_reward": 0.7265625,
"eval_rewards/format_reward_staging": 0.75,
"eval_runtime": 9.8391,
"eval_samples_per_second": 0.813,
"eval_steps_per_second": 0.102,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 44.1796875,
"epoch": 65.5,
"grad_norm": 87.36549568538945,
"kl": 5.3486328125,
"learning_rate": 1.700909264299851e-05,
"loss": 0.2066,
"reward": 2.609375,
"reward_std": 1.7113949656486511,
"rewards/accuracy_reward_staging": 0.1640625,
"rewards/format_reward": 0.4765625,
"rewards/format_reward_staging": 0.4921875,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 100.0,
"epoch": 66.0,
"grad_norm": 8.157221542562647,
"kl": 1.572265625,
"learning_rate": 1.6946583704589973e-05,
"loss": -0.028,
"reward": 4.15625,
"reward_std": 0.9789125919342041,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 57.234375,
"epoch": 66.5,
"grad_norm": 11.318664763937058,
"kl": 1.568359375,
"learning_rate": 1.688354575693754e-05,
"loss": 0.0296,
"reward": 3.65625,
"reward_std": 2.0427924394607544,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.75,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 48.5,
"epoch": 67.0,
"grad_norm": 4.231087003668829,
"kl": 1.572265625,
"learning_rate": 1.6819983600624986e-05,
"loss": -0.0374,
"reward": 6.015625,
"reward_std": 1.1542446613311768,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.640625,
"rewards/format_reward_staging": 0.6875,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 49.75,
"epoch": 67.5,
"grad_norm": 5.475126751687303,
"kl": 1.1953125,
"learning_rate": 1.6755902076156606e-05,
"loss": -0.0876,
"reward": 3.875,
"reward_std": 0.32214587926864624,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.640625,
"rewards/format_reward_staging": 0.734375,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 49.03125,
"epoch": 68.0,
"grad_norm": 3.1878976206293186,
"kl": 1.29296875,
"learning_rate": 1.6691306063588583e-05,
"loss": 0.039,
"reward": 4.390625,
"reward_std": 2.472952723503113,
"rewards/accuracy_reward_staging": 0.296875,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.734375,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 43.78125,
"epoch": 68.5,
"grad_norm": 5.889272400451515,
"kl": 2.4609375,
"learning_rate": 1.6626200482157378e-05,
"loss": -0.077,
"reward": 3.515625,
"reward_std": 1.154121845960617,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.609375,
"rewards/format_reward_staging": 0.71875,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 45.75,
"epoch": 69.0,
"grad_norm": 3.7471899234962898,
"kl": 1.05859375,
"learning_rate": 1.6560590289905074e-05,
"loss": 0.0093,
"reward": 4.890625,
"reward_std": 2.029541850090027,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.734375,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 29.78125,
"epoch": 69.5,
"grad_norm": 4.907092274694152,
"kl": 1.103515625,
"learning_rate": 1.6494480483301836e-05,
"loss": 0.024,
"reward": 4.6875,
"reward_std": 2.245893716812134,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.453125,
"rewards/format_reward_staging": 0.484375,
"step": 139
},
{
"epoch": 70.0,
"grad_norm": 6.180078269058641,
"learning_rate": 1.6427876096865394e-05,
"loss": -0.0735,
"step": 140
},
{
"epoch": 70.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 45.1171875,
"eval_kl": 1.2315673828125,
"eval_loss": -0.0003372877836227417,
"eval_reward": 4.1484375,
"eval_reward_std": 1.8942626863718033,
"eval_rewards/accuracy_reward_staging": 0.2734375,
"eval_rewards/format_reward": 0.7109375,
"eval_rewards/format_reward_staging": 0.703125,
"eval_runtime": 7.3533,
"eval_samples_per_second": 1.088,
"eval_steps_per_second": 0.136,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 57.609375,
"epoch": 70.5,
"grad_norm": 3.7369372987659357,
"kl": 1.7197265625,
"learning_rate": 1.636078220277764e-05,
"loss": -0.0225,
"reward": 4.7265625,
"reward_std": 2.5463077425956726,
"rewards/accuracy_reward_staging": 0.2890625,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.9453125,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 41.0,
"epoch": 71.0,
"grad_norm": 7.580877395639736,
"kl": 2.69921875,
"learning_rate": 1.6293203910498375e-05,
"loss": 0.0622,
"reward": 2.546875,
"reward_std": 1.2390142679214478,
"rewards/accuracy_reward_staging": 0.15625,
"rewards/format_reward": 0.484375,
"rewards/format_reward_staging": 0.5,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 46.046875,
"epoch": 71.5,
"grad_norm": 6.018587414833117,
"kl": 1.54296875,
"learning_rate": 1.6225146366376198e-05,
"loss": -0.0145,
"reward": 7.125,
"reward_std": 3.5123836994171143,
"rewards/accuracy_reward_staging": 0.578125,
"rewards/format_reward": 0.625,
"rewards/format_reward_staging": 0.71875,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 44.09375,
"epoch": 72.0,
"grad_norm": 12.708930539177073,
"kl": 2.927734375,
"learning_rate": 1.6156614753256583e-05,
"loss": 0.0341,
"reward": 1.40625,
"reward_std": 1.3823386430740356,
"rewards/accuracy_reward_staging": 0.046875,
"rewards/format_reward": 0.296875,
"rewards/format_reward_staging": 0.640625,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 41.6875,
"epoch": 72.5,
"grad_norm": 26.335774506376318,
"kl": 2.484375,
"learning_rate": 1.608761429008721e-05,
"loss": 0.0802,
"reward": 4.0,
"reward_std": 2.5394824892282486,
"rewards/accuracy_reward_staging": 0.296875,
"rewards/format_reward": 0.34375,
"rewards/format_reward_staging": 0.6875,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 43.0,
"epoch": 73.0,
"grad_norm": 10.027812774226838,
"kl": 2.0234375,
"learning_rate": 1.6018150231520486e-05,
"loss": -0.0637,
"reward": 5.078125,
"reward_std": 2.6153001338243484,
"rewards/accuracy_reward_staging": 0.390625,
"rewards/format_reward": 0.515625,
"rewards/format_reward_staging": 0.65625,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 60.21875,
"epoch": 73.5,
"grad_norm": 7.224717472496968,
"kl": 2.08203125,
"learning_rate": 1.5948227867513416e-05,
"loss": -0.0085,
"reward": 6.375,
"reward_std": 1.8922232389450073,
"rewards/accuracy_reward_staging": 0.453125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 28.8125,
"epoch": 74.0,
"grad_norm": 3.90403458561103,
"kl": 1.033203125,
"learning_rate": 1.5877852522924733e-05,
"loss": -0.0176,
"reward": 4.671875,
"reward_std": 2.339739680290222,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.46875,
"rewards/format_reward_staging": 0.453125,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 56.609375,
"epoch": 74.5,
"grad_norm": 5.8341448086549965,
"kl": 1.45703125,
"learning_rate": 1.5807029557109398e-05,
"loss": -0.0544,
"reward": 5.734375,
"reward_std": 2.3053803741931915,
"rewards/accuracy_reward_staging": 0.390625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.921875,
"step": 149
},
{
"epoch": 75.0,
"grad_norm": 4.711908468937371,
"learning_rate": 1.573576436351046e-05,
"loss": 0.0152,
"step": 150
},
{
"epoch": 75.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 45.28125,
"eval_kl": 1.6767578125,
"eval_loss": -0.009632267989218235,
"eval_reward": 3.78125,
"eval_reward_std": 0.855195626616478,
"eval_rewards/accuracy_reward_staging": 0.234375,
"eval_rewards/format_reward": 0.7109375,
"eval_rewards/format_reward_staging": 0.7265625,
"eval_runtime": 7.9037,
"eval_samples_per_second": 1.012,
"eval_steps_per_second": 0.127,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 38.3984375,
"epoch": 75.5,
"grad_norm": 29.133371779710487,
"kl": 1.69482421875,
"learning_rate": 1.566406236924833e-05,
"loss": 0.038,
"reward": 3.4609375,
"reward_std": 0.8732095211744308,
"rewards/accuracy_reward_staging": 0.2265625,
"rewards/format_reward": 0.5859375,
"rewards/format_reward_staging": 0.609375,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 45.03125,
"epoch": 76.0,
"grad_norm": 71.16435014998271,
"kl": 1.560546875,
"learning_rate": 1.5591929034707468e-05,
"loss": 0.0866,
"reward": 1.4375,
"reward_std": 0.14433756470680237,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.703125,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 44.6875,
"epoch": 76.5,
"grad_norm": 1.0512585294331593,
"kl": 0.98028564453125,
"learning_rate": 1.5519369853120584e-05,
"loss": 0.0315,
"reward": 3.984375,
"reward_std": 0.0625,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.75,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 43.375,
"epoch": 77.0,
"grad_norm": 4.22052916794067,
"kl": 1.216796875,
"learning_rate": 1.5446390350150272e-05,
"loss": -0.0304,
"reward": 3.125,
"reward_std": 1.421726554632187,
"rewards/accuracy_reward_staging": 0.171875,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.734375,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 43.0625,
"epoch": 77.5,
"grad_norm": 3.803510821246787,
"kl": 1.2509765625,
"learning_rate": 1.5372996083468242e-05,
"loss": 0.0057,
"reward": 3.75,
"reward_std": 0.7565859854221344,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.734375,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 43.6875,
"epoch": 78.0,
"grad_norm": 1.6103062564934338,
"kl": 1.2275390625,
"learning_rate": 1.529919264233205e-05,
"loss": 0.0116,
"reward": 3.484375,
"reward_std": 1.0873424708843231,
"rewards/accuracy_reward_staging": 0.203125,
"rewards/format_reward": 0.703125,
"rewards/format_reward_staging": 0.75,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 46.46875,
"epoch": 78.5,
"grad_norm": 3.73947644081386,
"kl": 1.92578125,
"learning_rate": 1.5224985647159489e-05,
"loss": -0.0074,
"reward": 1.421875,
"reward_std": 0.17430339753627777,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.75,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 48.8125,
"epoch": 79.0,
"grad_norm": 168.19806472829526,
"kl": 4.16015625,
"learning_rate": 1.5150380749100545e-05,
"loss": 0.185,
"reward": 6.15625,
"reward_std": 1.3114574551582336,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.75,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 74.703125,
"epoch": 79.5,
"grad_norm": 8.124079093394728,
"kl": 3.5859375,
"learning_rate": 1.5075383629607043e-05,
"loss": 0.0754,
"reward": 4.140625,
"reward_std": 0.8125,
"rewards/accuracy_reward_staging": 0.265625,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.75,
"step": 159
},
{
"epoch": 80.0,
"grad_norm": 24.584018478630828,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.0492,
"step": 160
},
{
"epoch": 80.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 48.65625,
"eval_kl": 2.4384765625,
"eval_loss": 0.05181257054209709,
"eval_reward": 3.734375,
"eval_reward_std": 0.824847549200058,
"eval_rewards/accuracy_reward_staging": 0.2265625,
"eval_rewards/format_reward": 0.734375,
"eval_rewards/format_reward_staging": 0.734375,
"eval_runtime": 7.9932,
"eval_samples_per_second": 1.001,
"eval_steps_per_second": 0.125,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 76.4921875,
"epoch": 80.5,
"grad_norm": 6.67934298891478,
"kl": 4.8759765625,
"learning_rate": 1.4924235601034673e-05,
"loss": -0.0417,
"reward": 2.34375,
"reward_std": 0.5171605423092842,
"rewards/accuracy_reward_staging": 0.1171875,
"rewards/format_reward": 0.6015625,
"rewards/format_reward_staging": 0.5703125,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 54.84375,
"epoch": 81.0,
"grad_norm": 3.819874734506586,
"kl": 1.767578125,
"learning_rate": 1.4848096202463373e-05,
"loss": -0.0168,
"reward": 6.125,
"reward_std": 1.4207825064659119,
"rewards/accuracy_reward_staging": 0.421875,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.953125,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 98.859375,
"epoch": 81.5,
"grad_norm": 3.0908583823850546,
"kl": 1.6953125,
"learning_rate": 1.4771587602596085e-05,
"loss": 0.0503,
"reward": 3.515625,
"reward_std": 1.2676234245300293,
"rewards/accuracy_reward_staging": 0.203125,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.734375,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 84.71875,
"epoch": 82.0,
"grad_norm": 13.65817282343041,
"kl": 3.357421875,
"learning_rate": 1.469471562785891e-05,
"loss": 0.0879,
"reward": 3.984375,
"reward_std": 1.3739574551582336,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.765625,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 56.1875,
"epoch": 82.5,
"grad_norm": 9.321438696837953,
"kl": 2.984375,
"learning_rate": 1.4617486132350343e-05,
"loss": 0.0382,
"reward": 6.53125,
"reward_std": 1.5894616693258286,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.921875,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 117.40625,
"epoch": 83.0,
"grad_norm": 5.820169059348112,
"kl": 2.486328125,
"learning_rate": 1.4539904997395468e-05,
"loss": 0.046,
"reward": 0.984375,
"reward_std": 0.0625,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.5,
"rewards/format_reward_staging": 0.484375,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 56.96875,
"epoch": 83.5,
"grad_norm": 3.3619746674675457,
"kl": 1.662109375,
"learning_rate": 1.4461978131098089e-05,
"loss": 0.0254,
"reward": 3.953125,
"reward_std": 0.1875,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.734375,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 69.4375,
"epoch": 84.0,
"grad_norm": 4.984323181253807,
"kl": 2.224609375,
"learning_rate": 1.4383711467890776e-05,
"loss": -0.0085,
"reward": 3.5,
"reward_std": 1.2682048827409744,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.640625,
"rewards/format_reward_staging": 0.671875,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 53.859375,
"epoch": 84.5,
"grad_norm": 10.010983408872603,
"kl": 2.3125,
"learning_rate": 1.4305110968082953e-05,
"loss": -0.028,
"reward": 3.75,
"reward_std": 2.024695038795471,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.703125,
"rewards/format_reward_staging": 0.703125,
"step": 169
},
{
"epoch": 85.0,
"grad_norm": 4.824303447060251,
"learning_rate": 1.4226182617406996e-05,
"loss": 0.0597,
"step": 170
},
{
"epoch": 85.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 70.4140625,
"eval_kl": 2.732421875,
"eval_loss": 0.05451072007417679,
"eval_reward": 3.640625,
"eval_reward_std": 0.930374264717102,
"eval_rewards/accuracy_reward_staging": 0.21875,
"eval_rewards/format_reward": 0.7265625,
"eval_rewards/format_reward_staging": 0.7265625,
"eval_runtime": 13.695,
"eval_samples_per_second": 0.584,
"eval_steps_per_second": 0.073,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 89.2734375,
"epoch": 85.5,
"grad_norm": 19.993154281622356,
"kl": 2.4169921875,
"learning_rate": 1.4146932426562391e-05,
"loss": 0.0789,
"reward": 3.4296875,
"reward_std": 0.9237357676029205,
"rewards/accuracy_reward_staging": 0.2265625,
"rewards/format_reward": 0.5625,
"rewards/format_reward_staging": 0.6015625,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 58.90625,
"epoch": 86.0,
"grad_norm": 5.701163289666223,
"kl": 1.955078125,
"learning_rate": 1.4067366430758004e-05,
"loss": 0.0564,
"reward": 4.4375,
"reward_std": 0.21039125323295593,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.984375,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 104.9375,
"epoch": 86.5,
"grad_norm": 8.75300625649785,
"kl": 1.7421875,
"learning_rate": 1.3987490689252463e-05,
"loss": 0.0637,
"reward": 1.4375,
"reward_std": 0.21039125323295593,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.703125,
"rewards/format_reward_staging": 0.734375,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 62.90625,
"epoch": 87.0,
"grad_norm": 62.159358500619824,
"kl": 23.37890625,
"learning_rate": 1.3907311284892737e-05,
"loss": 0.1646,
"reward": 5.515625,
"reward_std": 2.2276171147823334,
"rewards/accuracy_reward_staging": 0.421875,
"rewards/format_reward": 0.640625,
"rewards/format_reward_staging": 0.65625,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 102.90625,
"epoch": 87.5,
"grad_norm": 12.112575261726283,
"kl": 3.974609375,
"learning_rate": 1.3826834323650899e-05,
"loss": 0.0835,
"reward": 3.9375,
"reward_std": 1.6707825064659119,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.71875,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 139.6875,
"epoch": 88.0,
"grad_norm": 35.23828311935734,
"kl": 7.0,
"learning_rate": 1.3746065934159123e-05,
"loss": 0.2503,
"reward": 3.703125,
"reward_std": 1.1275950223207474,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.671875,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 73.734375,
"epoch": 88.5,
"grad_norm": 34.69838372391708,
"kl": 6.234375,
"learning_rate": 1.3665012267242974e-05,
"loss": 0.3016,
"reward": 3.75,
"reward_std": 1.6523646861314774,
"rewards/accuracy_reward_staging": 0.203125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.84375,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 247.34375,
"epoch": 89.0,
"grad_norm": 8.743276590821418,
"kl": 1.91796875,
"learning_rate": 1.3583679495453e-05,
"loss": 0.1189,
"reward": 2.765625,
"reward_std": 1.4041407108306885,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.453125,
"rewards/format_reward_staging": 0.4375,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 185.65625,
"epoch": 89.5,
"grad_norm": 64.71970490888229,
"kl": 10.216796875,
"learning_rate": 1.3502073812594677e-05,
"loss": 0.2277,
"reward": 1.28125,
"reward_std": 0.48439764976501465,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.609375,
"step": 179
},
{
"epoch": 90.0,
"grad_norm": 641.7646960533058,
"learning_rate": 1.342020143325669e-05,
"loss": 1.3646,
"step": 180
},
{
"epoch": 90.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 180.421875,
"eval_kl": 4.01953125,
"eval_loss": 0.2227374017238617,
"eval_reward": 2.875,
"eval_reward_std": 2.4358191564679146,
"eval_rewards/accuracy_reward_staging": 0.171875,
"eval_rewards/format_reward": 0.5859375,
"eval_rewards/format_reward_staging": 0.5703125,
"eval_runtime": 21.5649,
"eval_samples_per_second": 0.371,
"eval_steps_per_second": 0.046,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 163.03125,
"epoch": 90.5,
"grad_norm": 28.433179154739406,
"kl": 26.9296875,
"learning_rate": 1.333806859233771e-05,
"loss": 0.2116,
"reward": 3.984375,
"reward_std": 2.283327080309391,
"rewards/accuracy_reward_staging": 0.2734375,
"rewards/format_reward": 0.6484375,
"rewards/format_reward_staging": 0.6015625,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 197.5625,
"epoch": 91.0,
"grad_norm": 35.69313057863262,
"kl": 9.8515625,
"learning_rate": 1.3255681544571568e-05,
"loss": 0.5447,
"reward": 3.21875,
"reward_std": 2.838429868221283,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.53125,
"rewards/format_reward_staging": 0.5,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 121.59375,
"epoch": 91.5,
"grad_norm": 22.53366341132682,
"kl": 4.31640625,
"learning_rate": 1.3173046564050923e-05,
"loss": 0.2777,
"reward": 2.71875,
"reward_std": 2.1865703761577606,
"rewards/accuracy_reward_staging": 0.15625,
"rewards/format_reward": 0.578125,
"rewards/format_reward_staging": 0.578125,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 70.78125,
"epoch": 92.0,
"grad_norm": 23.102470088426298,
"kl": 3.43359375,
"learning_rate": 1.3090169943749475e-05,
"loss": 0.2916,
"reward": 3.484375,
"reward_std": 3.165164679288864,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.59375,
"rewards/format_reward_staging": 0.546875,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 85.625,
"epoch": 92.5,
"grad_norm": 17.631303386576295,
"kl": 2.8828125,
"learning_rate": 1.300705799504273e-05,
"loss": 0.0945,
"reward": 3.1875,
"reward_std": 1.7888548523187637,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.625,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 60.96875,
"epoch": 93.0,
"grad_norm": 17.17673811558118,
"kl": 3.25390625,
"learning_rate": 1.2923717047227368e-05,
"loss": 0.0913,
"reward": 3.890625,
"reward_std": 3.1874619126319885,
"rewards/accuracy_reward_staging": 0.265625,
"rewards/format_reward": 0.640625,
"rewards/format_reward_staging": 0.59375,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 62.6875,
"epoch": 93.5,
"grad_norm": 21.78344941308155,
"kl": 3.767578125,
"learning_rate": 1.284015344703923e-05,
"loss": 0.1005,
"reward": 3.875,
"reward_std": 2.966622516512871,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.65625,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 74.4375,
"epoch": 94.0,
"grad_norm": 123.72852412016174,
"kl": 13.330078125,
"learning_rate": 1.2756373558169992e-05,
"loss": 0.3738,
"reward": 3.796875,
"reward_std": 1.500662550330162,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.796875,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 61.109375,
"epoch": 94.5,
"grad_norm": 15.826582743437257,
"kl": 2.990234375,
"learning_rate": 1.267238376078257e-05,
"loss": 0.0504,
"reward": 6.734375,
"reward_std": 4.132839202880859,
"rewards/accuracy_reward_staging": 0.515625,
"rewards/format_reward": 0.796875,
"rewards/format_reward_staging": 0.78125,
"step": 189
},
{
"epoch": 95.0,
"grad_norm": 12.174094790036403,
"learning_rate": 1.2588190451025209e-05,
"loss": 0.1562,
"step": 190
},
{
"epoch": 95.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 70.90625,
"eval_kl": 5.5693359375,
"eval_loss": 0.1942664086818695,
"eval_reward": 4.5625,
"eval_reward_std": 2.227724313735962,
"eval_rewards/accuracy_reward_staging": 0.296875,
"eval_rewards/format_reward": 0.796875,
"eval_rewards/format_reward_staging": 0.796875,
"eval_runtime": 11.384,
"eval_samples_per_second": 0.703,
"eval_steps_per_second": 0.088,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 67.8046875,
"epoch": 95.5,
"grad_norm": 29.4793161720303,
"kl": 7.30078125,
"learning_rate": 1.2503800040544417e-05,
"loss": 0.1191,
"reward": 2.21875,
"reward_std": 1.4497334137558937,
"rewards/accuracy_reward_staging": 0.078125,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.703125,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 60.0,
"epoch": 96.0,
"grad_norm": 18.528850308456384,
"kl": 5.0078125,
"learning_rate": 1.2419218955996677e-05,
"loss": -0.0102,
"reward": 6.4375,
"reward_std": 2.324888586997986,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.859375,
"rewards/format_reward_staging": 0.890625,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 65.65625,
"epoch": 96.5,
"grad_norm": 321.1577068941354,
"kl": 10.72265625,
"learning_rate": 1.2334453638559057e-05,
"loss": 0.3994,
"reward": 3.5625,
"reward_std": 1.81937974691391,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.84375,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 124.09375,
"epoch": 97.0,
"grad_norm": 298.30380544944734,
"kl": 16.484375,
"learning_rate": 1.2249510543438652e-05,
"loss": 1.1348,
"reward": 6.0,
"reward_std": 3.5904677510261536,
"rewards/accuracy_reward_staging": 0.453125,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.734375,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 61.265625,
"epoch": 97.5,
"grad_norm": 342.2488163586622,
"kl": 39.9609375,
"learning_rate": 1.2164396139381029e-05,
"loss": 0.6107,
"reward": 4.109375,
"reward_std": 1.6107770651578903,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.796875,
"rewards/format_reward_staging": 0.8125,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 71.5,
"epoch": 98.0,
"grad_norm": 332.30146930549876,
"kl": 18.890625,
"learning_rate": 1.2079116908177592e-05,
"loss": 0.7398,
"reward": 4.28125,
"reward_std": 3.740285500884056,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.5625,
"rewards/format_reward_staging": 0.59375,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 57.046875,
"epoch": 98.5,
"grad_norm": 40.05423534554934,
"kl": 9.9453125,
"learning_rate": 1.1993679344171973e-05,
"loss": -0.0582,
"reward": 5.203125,
"reward_std": 3.098964586853981,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.71875,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 72.75,
"epoch": 99.0,
"grad_norm": 190.25413324992218,
"kl": 18.7421875,
"learning_rate": 1.190808995376545e-05,
"loss": 0.4647,
"reward": 2.796875,
"reward_std": 2.5183838307857513,
"rewards/accuracy_reward_staging": 0.15625,
"rewards/format_reward": 0.625,
"rewards/format_reward_staging": 0.609375,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 75.15625,
"epoch": 99.5,
"grad_norm": 77.45958102748877,
"kl": 8.5078125,
"learning_rate": 1.1822355254921478e-05,
"loss": 0.2798,
"reward": 5.265625,
"reward_std": 4.132492363452911,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.59375,
"rewards/format_reward_staging": 0.609375,
"step": 199
},
{
"epoch": 100.0,
"grad_norm": 41.16387064311233,
"learning_rate": 1.1736481776669307e-05,
"loss": 0.1693,
"step": 200
},
{
"epoch": 100.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 74.859375,
"eval_kl": 11.505859375,
"eval_loss": 0.30862951278686523,
"eval_reward": 3.7578125,
"eval_reward_std": 3.54784195125103,
"eval_rewards/accuracy_reward_staging": 0.2578125,
"eval_rewards/format_reward": 0.59375,
"eval_rewards/format_reward_staging": 0.5859375,
"eval_runtime": 20.7253,
"eval_samples_per_second": 0.386,
"eval_steps_per_second": 0.048,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 62.8984375,
"epoch": 100.5,
"grad_norm": 46.99447269815204,
"kl": 7.478515625,
"learning_rate": 1.1650476058606776e-05,
"loss": 0.0531,
"reward": 3.7890625,
"reward_std": 2.675357274711132,
"rewards/accuracy_reward_staging": 0.2421875,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.6796875,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 112.5,
"epoch": 101.0,
"grad_norm": 87.46696486815911,
"kl": 10.09375,
"learning_rate": 1.156434465040231e-05,
"loss": 0.3972,
"reward": 3.765625,
"reward_std": 3.323235496878624,
"rewards/accuracy_reward_staging": 0.265625,
"rewards/format_reward": 0.5625,
"rewards/format_reward_staging": 0.546875,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 62.71875,
"epoch": 101.5,
"grad_norm": 18.374497201547754,
"kl": 3.76953125,
"learning_rate": 1.1478094111296109e-05,
"loss": -0.0432,
"reward": 3.28125,
"reward_std": 3.127034693956375,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.703125,
"rewards/format_reward_staging": 0.703125,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 71.78125,
"epoch": 102.0,
"grad_norm": 29.592536211203182,
"kl": 2.9140625,
"learning_rate": 1.1391731009600655e-05,
"loss": 0.1431,
"reward": 5.21875,
"reward_std": 4.481558993458748,
"rewards/accuracy_reward_staging": 0.390625,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.65625,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 123.96875,
"epoch": 102.5,
"grad_norm": 33.395337096968554,
"kl": 2.564453125,
"learning_rate": 1.130526192220052e-05,
"loss": 0.2351,
"reward": 5.3125,
"reward_std": 3.812277674674988,
"rewards/accuracy_reward_staging": 0.390625,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.71875,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 70.5625,
"epoch": 103.0,
"grad_norm": 19.970853229745217,
"kl": 2.921875,
"learning_rate": 1.1218693434051475e-05,
"loss": 0.0446,
"reward": 4.78125,
"reward_std": 2.263821601867676,
"rewards/accuracy_reward_staging": 0.328125,
"rewards/format_reward": 0.765625,
"rewards/format_reward_staging": 0.734375,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 68.78125,
"epoch": 103.5,
"grad_norm": 14.17204450158029,
"kl": 2.71875,
"learning_rate": 1.113203213767907e-05,
"loss": 0.0621,
"reward": 3.28125,
"reward_std": 2.0387277007102966,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.703125,
"rewards/format_reward_staging": 0.703125,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 98.84375,
"epoch": 104.0,
"grad_norm": 45.999753153217384,
"kl": 2.953125,
"learning_rate": 1.1045284632676535e-05,
"loss": 0.3512,
"reward": 7.703125,
"reward_std": 4.490310102701187,
"rewards/accuracy_reward_staging": 0.609375,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.796875,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 73.859375,
"epoch": 104.5,
"grad_norm": 16.39069761676693,
"kl": 2.822265625,
"learning_rate": 1.0958457525202241e-05,
"loss": 0.1927,
"reward": 7.15625,
"reward_std": 2.440823122859001,
"rewards/accuracy_reward_staging": 0.546875,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.84375,
"step": 209
},
{
"epoch": 105.0,
"grad_norm": 32.991938805145374,
"learning_rate": 1.0871557427476585e-05,
"loss": 0.2606,
"step": 210
},
{
"epoch": 105.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 76.171875,
"eval_kl": 2.79296875,
"eval_loss": 0.3493640124797821,
"eval_reward": 6.234375,
"eval_reward_std": 2.950258269906044,
"eval_rewards/accuracy_reward_staging": 0.453125,
"eval_rewards/format_reward": 0.8515625,
"eval_rewards/format_reward_staging": 0.8515625,
"eval_runtime": 17.079,
"eval_samples_per_second": 0.468,
"eval_steps_per_second": 0.059,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 80.3046875,
"epoch": 105.5,
"grad_norm": 14.591066359234059,
"kl": 3.3798828125,
"learning_rate": 1.0784590957278452e-05,
"loss": 0.2564,
"reward": 3.3984375,
"reward_std": 2.145370692014694,
"rewards/accuracy_reward_staging": 0.171875,
"rewards/format_reward": 0.8359375,
"rewards/format_reward_staging": 0.84375,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 76.625,
"epoch": 106.0,
"grad_norm": 213.24507222643214,
"kl": 15.8828125,
"learning_rate": 1.0697564737441254e-05,
"loss": 0.8925,
"reward": 9.28125,
"reward_std": 5.5177276730537415,
"rewards/accuracy_reward_staging": 0.765625,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.8125,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 110.328125,
"epoch": 106.5,
"grad_norm": 374.4514896454632,
"kl": 7.72265625,
"learning_rate": 1.0610485395348571e-05,
"loss": 1.0949,
"reward": 5.296875,
"reward_std": 2.8596606701612473,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.765625,
"rewards/format_reward_staging": 0.78125,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 69.875,
"epoch": 107.0,
"grad_norm": 1105.673343880607,
"kl": 33.830078125,
"learning_rate": 1.0523359562429441e-05,
"loss": 1.6287,
"reward": 7.4375,
"reward_std": 2.3971213698387146,
"rewards/accuracy_reward_staging": 0.5625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 73.890625,
"epoch": 107.5,
"grad_norm": 123.4698676668202,
"kl": 5.3046875,
"learning_rate": 1.0436193873653362e-05,
"loss": 0.3884,
"reward": 5.53125,
"reward_std": 2.4318894147872925,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.890625,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 102.03125,
"epoch": 108.0,
"grad_norm": 160.67968983733866,
"kl": 7.8359375,
"learning_rate": 1.0348994967025012e-05,
"loss": 0.9398,
"reward": 6.359375,
"reward_std": 4.46427845954895,
"rewards/accuracy_reward_staging": 0.484375,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.703125,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 75.0625,
"epoch": 108.5,
"grad_norm": 25.51936131101817,
"kl": 2.392578125,
"learning_rate": 1.0261769483078734e-05,
"loss": 0.2649,
"reward": 1.75,
"reward_std": 0.47541579604148865,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.828125,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 137.28125,
"epoch": 109.0,
"grad_norm": 148.7024918074023,
"kl": 5.13671875,
"learning_rate": 1.0174524064372837e-05,
"loss": 1.1347,
"reward": 11.765625,
"reward_std": 4.963782727718353,
"rewards/accuracy_reward_staging": 1.0,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.875,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 94.1875,
"epoch": 109.5,
"grad_norm": 42.61961427865945,
"kl": 5.0546875,
"learning_rate": 1.008726535498374e-05,
"loss": 0.8258,
"reward": 4.84375,
"reward_std": 2.711217939853668,
"rewards/accuracy_reward_staging": 0.328125,
"rewards/format_reward": 0.765625,
"rewards/format_reward_staging": 0.796875,
"step": 219
},
{
"epoch": 110.0,
"grad_norm": 134.36362522399034,
"learning_rate": 1e-05,
"loss": 1.1661,
"step": 220
},
{
"epoch": 110.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 117.09375,
"eval_kl": 4.9931640625,
"eval_loss": 0.5141848921775818,
"eval_reward": 6.1640625,
"eval_reward_std": 3.0512350350618362,
"eval_rewards/accuracy_reward_staging": 0.4453125,
"eval_rewards/format_reward": 0.84375,
"eval_rewards/format_reward_staging": 0.8671875,
"eval_runtime": 19.2232,
"eval_samples_per_second": 0.416,
"eval_steps_per_second": 0.052,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 133.65625,
"epoch": 110.5,
"grad_norm": 105.48263389855896,
"kl": 6.9462890625,
"learning_rate": 9.912734645016262e-06,
"loss": 0.8125,
"reward": 5.2578125,
"reward_std": 3.310664713382721,
"rewards/accuracy_reward_staging": 0.3671875,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.7734375,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 103.78125,
"epoch": 111.0,
"grad_norm": 28.059069132963547,
"kl": 6.390625,
"learning_rate": 9.825475935627165e-06,
"loss": 0.5255,
"reward": 8.9375,
"reward_std": 2.479752615094185,
"rewards/accuracy_reward_staging": 0.71875,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.859375,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 83.953125,
"epoch": 111.5,
"grad_norm": 71.56716252183017,
"kl": 2.30078125,
"learning_rate": 9.738230516921272e-06,
"loss": 0.4025,
"reward": 1.796875,
"reward_std": 0.54747274518013,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.90625,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 100.0,
"epoch": 112.0,
"grad_norm": 27.218944194745887,
"kl": 3.3046875,
"learning_rate": 9.651005032974994e-06,
"loss": 0.2448,
"reward": 10.125,
"reward_std": 5.460300043225288,
"rewards/accuracy_reward_staging": 0.84375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.8125,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 94.84375,
"epoch": 112.5,
"grad_norm": 68.22276023866495,
"kl": 2.3671875,
"learning_rate": 9.563806126346643e-06,
"loss": 0.5597,
"reward": 7.28125,
"reward_std": 2.8067111521959305,
"rewards/accuracy_reward_staging": 0.546875,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 74.75,
"epoch": 113.0,
"grad_norm": 49.515502954596094,
"kl": 1.8046875,
"learning_rate": 9.476640437570562e-06,
"loss": 0.4145,
"reward": 5.734375,
"reward_std": 2.974400073289871,
"rewards/accuracy_reward_staging": 0.390625,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.890625,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 84.515625,
"epoch": 113.5,
"grad_norm": 80.52287464279028,
"kl": 1.96484375,
"learning_rate": 9.38951460465143e-06,
"loss": 0.5935,
"reward": 4.671875,
"reward_std": 2.0293429493904114,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.921875,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 78.3125,
"epoch": 114.0,
"grad_norm": 79.3984300703632,
"kl": 9.23046875,
"learning_rate": 9.302435262558748e-06,
"loss": 0.5448,
"reward": 8.84375,
"reward_std": 3.9457506239414215,
"rewards/accuracy_reward_staging": 0.703125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 82.3125,
"epoch": 114.5,
"grad_norm": 144.0165272069023,
"kl": 7.28125,
"learning_rate": 9.215409042721553e-06,
"loss": 0.6451,
"reward": 5.078125,
"reward_std": 2.225419983267784,
"rewards/accuracy_reward_staging": 0.328125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.890625,
"step": 229
},
{
"epoch": 115.0,
"grad_norm": 321.9641788098979,
"learning_rate": 9.128442572523418e-06,
"loss": 1.5742,
"step": 230
},
{
"epoch": 115.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 96.328125,
"eval_kl": 5.0361328125,
"eval_loss": 0.9659979939460754,
"eval_reward": 6.09375,
"eval_reward_std": 2.717326804995537,
"eval_rewards/accuracy_reward_staging": 0.4375,
"eval_rewards/format_reward": 0.8828125,
"eval_rewards/format_reward_staging": 0.8359375,
"eval_runtime": 21.7514,
"eval_samples_per_second": 0.368,
"eval_steps_per_second": 0.046,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 133.8828125,
"epoch": 115.5,
"grad_norm": 330.9234870737541,
"kl": 7.890625,
"learning_rate": 9.04154247479776e-06,
"loss": 1.4764,
"reward": 5.0703125,
"reward_std": 3.3657846450805664,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.828125,
"rewards/format_reward_staging": 0.8046875,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 76.3125,
"epoch": 116.0,
"grad_norm": 110.48910505492681,
"kl": 4.37890625,
"learning_rate": 8.954715367323468e-06,
"loss": 0.5441,
"reward": 7.890625,
"reward_std": 3.448995918035507,
"rewards/accuracy_reward_staging": 0.609375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.890625,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 90.125,
"epoch": 116.5,
"grad_norm": 24.431265643699565,
"kl": 2.525390625,
"learning_rate": 8.867967862320935e-06,
"loss": 0.498,
"reward": 5.21875,
"reward_std": 2.0470831990242004,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.875,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 69.15625,
"epoch": 117.0,
"grad_norm": 22.167233415343524,
"kl": 3.0390625,
"learning_rate": 8.781306565948528e-06,
"loss": 0.2231,
"reward": 7.9375,
"reward_std": 2.1919010430574417,
"rewards/accuracy_reward_staging": 0.609375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.90625,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 85.6875,
"epoch": 117.5,
"grad_norm": 15.990875997337938,
"kl": 2.197265625,
"learning_rate": 8.694738077799487e-06,
"loss": 0.3279,
"reward": 5.0,
"reward_std": 2.6655498147010803,
"rewards/accuracy_reward_staging": 0.328125,
"rewards/format_reward": 0.859375,
"rewards/format_reward_staging": 0.859375,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 69.34375,
"epoch": 118.0,
"grad_norm": 17.73495165238854,
"kl": 2.447265625,
"learning_rate": 8.60826899039935e-06,
"loss": -0.0273,
"reward": 7.859375,
"reward_std": 3.111342281103134,
"rewards/accuracy_reward_staging": 0.609375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.890625,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 102.9375,
"epoch": 118.5,
"grad_norm": 62.41785090382259,
"kl": 3.140625,
"learning_rate": 8.521905888703894e-06,
"loss": 0.4091,
"reward": 5.640625,
"reward_std": 2.2090050280094147,
"rewards/accuracy_reward_staging": 0.390625,
"rewards/format_reward": 0.859375,
"rewards/format_reward_staging": 0.875,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 99.9375,
"epoch": 119.0,
"grad_norm": 19.556702226372,
"kl": 1.966796875,
"learning_rate": 8.43565534959769e-06,
"loss": 0.3726,
"reward": 7.4375,
"reward_std": 3.3722455203533173,
"rewards/accuracy_reward_staging": 0.5625,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.875,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 97.28125,
"epoch": 119.5,
"grad_norm": 46.29528683228946,
"kl": 9.9296875,
"learning_rate": 8.349523941393224e-06,
"loss": 0.8543,
"reward": 6.15625,
"reward_std": 2.8786120861768723,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.890625,
"step": 239
},
{
"epoch": 120.0,
"grad_norm": 74.29601903587687,
"learning_rate": 8.263518223330698e-06,
"loss": 0.6792,
"step": 240
},
{
"epoch": 120.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 85.34375,
"eval_kl": 3.4296875,
"eval_loss": 0.3411344885826111,
"eval_reward": 6.59375,
"eval_reward_std": 2.5715944170951843,
"eval_rewards/accuracy_reward_staging": 0.4765625,
"eval_rewards/format_reward": 0.90625,
"eval_rewards/format_reward_staging": 0.921875,
"eval_runtime": 14.6233,
"eval_samples_per_second": 0.547,
"eval_steps_per_second": 0.068,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 76.328125,
"epoch": 120.5,
"grad_norm": 10.644912372537606,
"kl": 2.2607421875,
"learning_rate": 8.177644745078525e-06,
"loss": 0.0768,
"reward": 6.1953125,
"reward_std": 2.617633506655693,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.8984375,
"rewards/format_reward_staging": 0.921875,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 100.0625,
"epoch": 121.0,
"grad_norm": 111.92253062146008,
"kl": 3.75,
"learning_rate": 8.091910046234552e-06,
"loss": 0.5913,
"reward": 8.21875,
"reward_std": 2.894813686609268,
"rewards/accuracy_reward_staging": 0.640625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 82.8125,
"epoch": 121.5,
"grad_norm": 35.99238132907505,
"kl": 2.3828125,
"learning_rate": 8.00632065582803e-06,
"loss": 0.3878,
"reward": 4.171875,
"reward_std": 1.0875328481197357,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.90625,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 99.71875,
"epoch": 122.0,
"grad_norm": 75.25420921507221,
"kl": 3.81640625,
"learning_rate": 7.92088309182241e-06,
"loss": 0.5724,
"reward": 9.671875,
"reward_std": 3.44248828291893,
"rewards/accuracy_reward_staging": 0.78125,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.9375,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 70.609375,
"epoch": 122.5,
"grad_norm": 2.6776020742038265,
"kl": 1.322265625,
"learning_rate": 7.835603860618973e-06,
"loss": 0.0461,
"reward": 5.3125,
"reward_std": 2.067808836698532,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.921875,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 69.90625,
"epoch": 123.0,
"grad_norm": 122.57486492999168,
"kl": 6.97265625,
"learning_rate": 7.750489456561351e-06,
"loss": 0.5057,
"reward": 7.84375,
"reward_std": 2.4874102771282196,
"rewards/accuracy_reward_staging": 0.59375,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.953125,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 70.46875,
"epoch": 123.5,
"grad_norm": 7.815509886711204,
"kl": 2.482421875,
"learning_rate": 7.66554636144095e-06,
"loss": -0.0082,
"reward": 4.21875,
"reward_std": 0.9797288179397583,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 82.28125,
"epoch": 124.0,
"grad_norm": 45.837077069450096,
"kl": 2.609375,
"learning_rate": 7.580781044003324e-06,
"loss": 0.7357,
"reward": 9.953125,
"reward_std": 3.699725955724716,
"rewards/accuracy_reward_staging": 0.8125,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.921875,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 71.984375,
"epoch": 124.5,
"grad_norm": 2.8144133334907924,
"kl": 1.357421875,
"learning_rate": 7.496199959455584e-06,
"loss": 0.0622,
"reward": 5.71875,
"reward_std": 1.41599440574646,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 0.96875,
"step": 249
},
{
"epoch": 125.0,
"grad_norm": 45.183676518213446,
"learning_rate": 7.411809548974792e-06,
"loss": 0.6075,
"step": 250
},
{
"epoch": 125.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 86.328125,
"eval_kl": 2.1416015625,
"eval_loss": 0.3427232503890991,
"eval_reward": 7.1171875,
"eval_reward_std": 2.1937270909547806,
"eval_rewards/accuracy_reward_staging": 0.5234375,
"eval_rewards/format_reward": 0.9375,
"eval_rewards/format_reward_staging": 0.9453125,
"eval_runtime": 20.0325,
"eval_samples_per_second": 0.399,
"eval_steps_per_second": 0.05,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 117.84375,
"epoch": 125.5,
"grad_norm": 49.63285548086623,
"kl": 4.2412109375,
"learning_rate": 7.327616239217432e-06,
"loss": 0.4637,
"reward": 8.2421875,
"reward_std": 2.541686810553074,
"rewards/accuracy_reward_staging": 0.640625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9296875,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 77.84375,
"epoch": 126.0,
"grad_norm": 92.39572072696474,
"kl": 6.615234375,
"learning_rate": 7.243626441830009e-06,
"loss": 0.5525,
"reward": 6.28125,
"reward_std": 1.9668870717287064,
"rewards/accuracy_reward_staging": 0.453125,
"rewards/format_reward": 0.859375,
"rewards/format_reward_staging": 0.890625,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 101.203125,
"epoch": 126.5,
"grad_norm": 10902.1669383775,
"kl": 80.533203125,
"learning_rate": 7.159846552960774e-06,
"loss": 17.4523,
"reward": 10.0625,
"reward_std": 3.160782814025879,
"rewards/accuracy_reward_staging": 0.8125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 100.96875,
"epoch": 127.0,
"grad_norm": 59.42071850320288,
"kl": 2.439453125,
"learning_rate": 7.076282952772634e-06,
"loss": 0.5484,
"reward": 4.140625,
"reward_std": 1.1416241526603699,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.921875,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 99.296875,
"epoch": 127.5,
"grad_norm": 39.19631141515369,
"kl": 3.0078125,
"learning_rate": 6.992942004957271e-06,
"loss": 0.4412,
"reward": 6.015625,
"reward_std": 1.9026378691196442,
"rewards/accuracy_reward_staging": 0.421875,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.90625,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 69.9375,
"epoch": 128.0,
"grad_norm": 32.45114944744183,
"kl": 3.859375,
"learning_rate": 6.909830056250527e-06,
"loss": 0.0636,
"reward": 8.984375,
"reward_std": 1.82603120803833,
"rewards/accuracy_reward_staging": 0.703125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 83.421875,
"epoch": 128.5,
"grad_norm": 37.72964417808751,
"kl": 2.177734375,
"learning_rate": 6.826953435949081e-06,
"loss": 0.4592,
"reward": 8.4375,
"reward_std": 1.9335529208183289,
"rewards/accuracy_reward_staging": 0.65625,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 99.84375,
"epoch": 129.0,
"grad_norm": 29.569155006053123,
"kl": 2.361328125,
"learning_rate": 6.744318455428436e-06,
"loss": 0.6722,
"reward": 5.40625,
"reward_std": 2.279918909072876,
"rewards/accuracy_reward_staging": 0.359375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 113.359375,
"epoch": 129.5,
"grad_norm": 35.01023482911739,
"kl": 2.0,
"learning_rate": 6.661931407662292e-06,
"loss": 0.83,
"reward": 8.0625,
"reward_std": 3.0837645530700684,
"rewards/accuracy_reward_staging": 0.625,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.921875,
"step": 259
},
{
"epoch": 130.0,
"grad_norm": 8.168155459606666,
"learning_rate": 6.579798566743314e-06,
"loss": -0.0564,
"step": 260
},
{
"epoch": 130.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 92.515625,
"eval_kl": 4.40625,
"eval_loss": 0.7453440427780151,
"eval_reward": 7.3203125,
"eval_reward_std": 2.501967176795006,
"eval_rewards/accuracy_reward_staging": 0.546875,
"eval_rewards/format_reward": 0.9140625,
"eval_rewards/format_reward_staging": 0.9375,
"eval_runtime": 18.9405,
"eval_samples_per_second": 0.422,
"eval_steps_per_second": 0.053,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 91.9921875,
"epoch": 130.5,
"grad_norm": 82.11170212285718,
"kl": 3.4697265625,
"learning_rate": 6.497926187405326e-06,
"loss": 1.0187,
"reward": 7.8125,
"reward_std": 3.268296256661415,
"rewards/accuracy_reward_staging": 0.6015625,
"rewards/format_reward": 0.8828125,
"rewards/format_reward_staging": 0.9140625,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 69.40625,
"epoch": 131.0,
"grad_norm": 59.6837830116688,
"kl": 4.412109375,
"learning_rate": 6.4163205045469975e-06,
"loss": 0.529,
"reward": 4.15625,
"reward_std": 1.1744744330644608,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.921875,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 100.359375,
"epoch": 131.5,
"grad_norm": 23.5284023934794,
"kl": 1.95703125,
"learning_rate": 6.334987732757028e-06,
"loss": 0.4948,
"reward": 9.71875,
"reward_std": 4.248636841773987,
"rewards/accuracy_reward_staging": 0.78125,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.953125,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 109.03125,
"epoch": 132.0,
"grad_norm": 30.888217422896634,
"kl": 2.41796875,
"learning_rate": 6.25393406584088e-06,
"loss": 0.4399,
"reward": 4.390625,
"reward_std": 0.2640564441680908,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.953125,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 116.28125,
"epoch": 132.5,
"grad_norm": 65.88875351387496,
"kl": 3.296875,
"learning_rate": 6.173165676349103e-06,
"loss": 0.9426,
"reward": 5.6875,
"reward_std": 2.6246196627616882,
"rewards/accuracy_reward_staging": 0.390625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.875,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 108.25,
"epoch": 133.0,
"grad_norm": 28.80675313257115,
"kl": 2.587890625,
"learning_rate": 6.092688715107265e-06,
"loss": 0.7637,
"reward": 7.390625,
"reward_std": 3.304721415042877,
"rewards/accuracy_reward_staging": 0.5625,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.890625,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 141.375,
"epoch": 133.5,
"grad_norm": 102.51920405283484,
"kl": 4.94140625,
"learning_rate": 6.0125093107475385e-06,
"loss": 0.8444,
"reward": 9.265625,
"reward_std": 3.659609690308571,
"rewards/accuracy_reward_staging": 0.75,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.875,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 73.71875,
"epoch": 134.0,
"grad_norm": 85.74081039820032,
"kl": 2.640625,
"learning_rate": 5.932633569242e-06,
"loss": 0.6961,
"reward": 4.15625,
"reward_std": 1.1662903726100922,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.875,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 70.5,
"epoch": 134.5,
"grad_norm": 220.54398843264903,
"kl": 23.98828125,
"learning_rate": 5.853067573437612e-06,
"loss": 0.4613,
"reward": 4.078125,
"reward_std": 1.1815360486507416,
"rewards/accuracy_reward_staging": 0.21875,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.9375,
"step": 269
},
{
"epoch": 135.0,
"grad_norm": 69.3686388417835,
"learning_rate": 5.773817382593008e-06,
"loss": 0.8036,
"step": 270
},
{
"epoch": 135.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 133.703125,
"eval_kl": 2.3740234375,
"eval_loss": 0.5488725304603577,
"eval_reward": 6.65625,
"eval_reward_std": 2.7503354847431183,
"eval_rewards/accuracy_reward_staging": 0.4921875,
"eval_rewards/format_reward": 0.8828125,
"eval_rewards/format_reward_staging": 0.8515625,
"eval_runtime": 25.2428,
"eval_samples_per_second": 0.317,
"eval_steps_per_second": 0.04,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 87.28125,
"epoch": 135.5,
"grad_norm": 7.78574597343019,
"kl": 2.5751953125,
"learning_rate": 5.694889031917047e-06,
"loss": 0.0375,
"reward": 9.046875,
"reward_std": 2.9865086674690247,
"rewards/accuracy_reward_staging": 0.7265625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.875,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 69.125,
"epoch": 136.0,
"grad_norm": 63.17030825405449,
"kl": 1.947265625,
"learning_rate": 5.616288532109225e-06,
"loss": 0.6616,
"reward": 6.046875,
"reward_std": 1.967979907989502,
"rewards/accuracy_reward_staging": 0.421875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.890625,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 87.859375,
"epoch": 136.5,
"grad_norm": 5.340771645114791,
"kl": 2.205078125,
"learning_rate": 5.5380218689019125e-06,
"loss": 0.3324,
"reward": 5.265625,
"reward_std": 1.8316132873296738,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.90625,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 178.15625,
"epoch": 137.0,
"grad_norm": 22.482391196655637,
"kl": 3.6875,
"learning_rate": 5.460095002604533e-06,
"loss": 0.4607,
"reward": 6.625,
"reward_std": 4.032912701368332,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.8125,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 73.421875,
"epoch": 137.5,
"grad_norm": 13.527272291689822,
"kl": 1.958984375,
"learning_rate": 5.382513867649663e-06,
"loss": 0.0469,
"reward": 6.0625,
"reward_std": 2.515269100666046,
"rewards/accuracy_reward_staging": 0.421875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.90625,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 161.09375,
"epoch": 138.0,
"grad_norm": 610.5487969369213,
"kl": 6.744140625,
"learning_rate": 5.305284372141095e-06,
"loss": 1.6861,
"reward": 7.59375,
"reward_std": 3.4400684684515,
"rewards/accuracy_reward_staging": 0.609375,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.75,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 204.046875,
"epoch": 138.5,
"grad_norm": 40.311818383902995,
"kl": 43.591796875,
"learning_rate": 5.228412397403916e-06,
"loss": 0.7055,
"reward": 4.25,
"reward_std": 2.983577609062195,
"rewards/accuracy_reward_staging": 0.265625,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.78125,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 74.875,
"epoch": 139.0,
"grad_norm": 38.02177180776487,
"kl": 4.14453125,
"learning_rate": 5.151903797536631e-06,
"loss": 0.6329,
"reward": 6.765625,
"reward_std": 2.8731206506490707,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.875,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 102.953125,
"epoch": 139.5,
"grad_norm": 15.046522887761398,
"kl": 2.10546875,
"learning_rate": 5.075764398965331e-06,
"loss": 0.4068,
"reward": 1.96875,
"reward_std": 1.4316468834877014,
"rewards/accuracy_reward_staging": 0.03125,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.8125,
"step": 279
},
{
"epoch": 140.0,
"grad_norm": 21.00508786132047,
"learning_rate": 5.000000000000003e-06,
"loss": 0.3769,
"step": 280
},
{
"epoch": 140.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 108.140625,
"eval_kl": 2.791015625,
"eval_loss": 0.3034168779850006,
"eval_reward": 5.671875,
"eval_reward_std": 3.041542984545231,
"eval_rewards/accuracy_reward_staging": 0.3984375,
"eval_rewards/format_reward": 0.8515625,
"eval_rewards/format_reward_staging": 0.8359375,
"eval_runtime": 26.3078,
"eval_samples_per_second": 0.304,
"eval_steps_per_second": 0.038,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 170.3359375,
"epoch": 140.5,
"grad_norm": 83.46938912876466,
"kl": 1.9345703125,
"learning_rate": 4.924616370392962e-06,
"loss": 0.6293,
"reward": 7.96875,
"reward_std": 4.6416375786066055,
"rewards/accuracy_reward_staging": 0.6328125,
"rewards/format_reward": 0.8359375,
"rewards/format_reward_staging": 0.8046875,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 127.6875,
"epoch": 141.0,
"grad_norm": 9.042778775609163,
"kl": 2.693359375,
"learning_rate": 4.849619250899458e-06,
"loss": 0.3913,
"reward": 5.640625,
"reward_std": 2.8702940493822098,
"rewards/accuracy_reward_staging": 0.390625,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.890625,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 139.03125,
"epoch": 141.5,
"grad_norm": 49.6878570546121,
"kl": 3.50390625,
"learning_rate": 4.775014352840512e-06,
"loss": 0.9548,
"reward": 6.390625,
"reward_std": 3.8147538006305695,
"rewards/accuracy_reward_staging": 0.484375,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.765625,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 70.84375,
"epoch": 142.0,
"grad_norm": 2.936217141963846,
"kl": 1.498046875,
"learning_rate": 4.700807357667953e-06,
"loss": 0.0258,
"reward": 5.015625,
"reward_std": 1.7188784629106522,
"rewards/accuracy_reward_staging": 0.3125,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.921875,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 234.46875,
"epoch": 142.5,
"grad_norm": 231.51752728996516,
"kl": 7.8046875,
"learning_rate": 4.627003916531761e-06,
"loss": 2.1654,
"reward": 3.1875,
"reward_std": 2.9490927308797836,
"rewards/accuracy_reward_staging": 0.171875,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.71875,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 132.8125,
"epoch": 143.0,
"grad_norm": 55.72832613480593,
"kl": 1.64453125,
"learning_rate": 4.5536096498497295e-06,
"loss": 1.0135,
"reward": 7.109375,
"reward_std": 2.9616019427776337,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.921875,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 131.984375,
"epoch": 143.5,
"grad_norm": 500.69423335954184,
"kl": 9.09375,
"learning_rate": 4.480630146879419e-06,
"loss": 0.8396,
"reward": 4.234375,
"reward_std": 2.296322599053383,
"rewards/accuracy_reward_staging": 0.265625,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.765625,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 163.25,
"epoch": 144.0,
"grad_norm": 78.65105992380144,
"kl": 1.7265625,
"learning_rate": 4.408070965292534e-06,
"loss": 0.5859,
"reward": 6.453125,
"reward_std": 3.582520604133606,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.875,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 152.4375,
"epoch": 144.5,
"grad_norm": 145.11899661299873,
"kl": 3.505859375,
"learning_rate": 4.335937630751675e-06,
"loss": 0.8424,
"reward": 3.5,
"reward_std": 2.5658179223537445,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.828125,
"rewards/format_reward_staging": 0.796875,
"step": 289
},
{
"epoch": 145.0,
"grad_norm": 21.829024770798654,
"learning_rate": 4.264235636489542e-06,
"loss": 0.4036,
"step": 290
},
{
"epoch": 145.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 118.2890625,
"eval_kl": 2.580078125,
"eval_loss": 0.45962250232696533,
"eval_reward": 5.0390625,
"eval_reward_std": 2.569364294409752,
"eval_rewards/accuracy_reward_staging": 0.3359375,
"eval_rewards/format_reward": 0.8515625,
"eval_rewards/format_reward_staging": 0.828125,
"eval_runtime": 25.2258,
"eval_samples_per_second": 0.317,
"eval_steps_per_second": 0.04,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 94.2578125,
"epoch": 145.5,
"grad_norm": 99.26323743266907,
"kl": 2.0009765625,
"learning_rate": 4.192970442890602e-06,
"loss": 0.9692,
"reward": 6.3828125,
"reward_std": 2.864999257028103,
"rewards/accuracy_reward_staging": 0.4609375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.8671875,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 242.90625,
"epoch": 146.0,
"grad_norm": 2195.8990163427725,
"kl": 20.560546875,
"learning_rate": 4.12214747707527e-06,
"loss": 2.8321,
"reward": 3.453125,
"reward_std": 2.1853206753730774,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.796875,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 70.984375,
"epoch": 146.5,
"grad_norm": 5.612016216510325,
"kl": 1.904296875,
"learning_rate": 4.051772132486589e-06,
"loss": -0.0196,
"reward": 6.5625,
"reward_std": 1.6346493661403656,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9375,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 116.75,
"epoch": 147.0,
"grad_norm": 24.439737033122462,
"kl": 5.474609375,
"learning_rate": 3.981849768479516e-06,
"loss": 1.072,
"reward": 3.296875,
"reward_std": 2.2417181879281998,
"rewards/accuracy_reward_staging": 0.171875,
"rewards/format_reward": 0.796875,
"rewards/format_reward_staging": 0.78125,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 219.84375,
"epoch": 147.5,
"grad_norm": 149.74167295056185,
"kl": 6.009765625,
"learning_rate": 3.912385709912794e-06,
"loss": 1.0669,
"reward": 4.1875,
"reward_std": 2.5385715812444687,
"rewards/accuracy_reward_staging": 0.265625,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.78125,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 98.15625,
"epoch": 148.0,
"grad_norm": 19.302640459669014,
"kl": 2.12109375,
"learning_rate": 3.8433852467434175e-06,
"loss": 0.4691,
"reward": 6.453125,
"reward_std": 2.6673848778009415,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.890625,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 190.71875,
"epoch": 148.5,
"grad_norm": 20.309279907380233,
"kl": 3.623046875,
"learning_rate": 3.774853633623806e-06,
"loss": 0.8401,
"reward": 4.875,
"reward_std": 2.2293783873319626,
"rewards/accuracy_reward_staging": 0.328125,
"rewards/format_reward": 0.796875,
"rewards/format_reward_staging": 0.796875,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 131.5,
"epoch": 149.0,
"grad_norm": 239.5186164999186,
"kl": 3.22265625,
"learning_rate": 3.7067960895016277e-06,
"loss": 1.4061,
"reward": 6.734375,
"reward_std": 3.5877325236797333,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.859375,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 132.84375,
"epoch": 149.5,
"grad_norm": 14.77833452952522,
"kl": 2.74609375,
"learning_rate": 3.6392177972223596e-06,
"loss": 1.032,
"reward": 3.984375,
"reward_std": 1.3325151801109314,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.796875,
"rewards/format_reward_staging": 0.84375,
"step": 299
},
{
"epoch": 150.0,
"grad_norm": 11.339677519890218,
"learning_rate": 3.5721239031346067e-06,
"loss": 0.9908,
"step": 300
},
{
"epoch": 150.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 153.859375,
"eval_kl": 2.41015625,
"eval_loss": 0.36936071515083313,
"eval_reward": 5.0234375,
"eval_reward_std": 2.693635329604149,
"eval_rewards/accuracy_reward_staging": 0.3359375,
"eval_rewards/format_reward": 0.8203125,
"eval_rewards/format_reward_staging": 0.84375,
"eval_runtime": 30.2609,
"eval_samples_per_second": 0.264,
"eval_steps_per_second": 0.033,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 105.4609375,
"epoch": 150.5,
"grad_norm": 13.48307651344578,
"kl": 2.556640625,
"learning_rate": 3.505519516698165e-06,
"loss": 0.8506,
"reward": 5.8515625,
"reward_std": 2.9722983986139297,
"rewards/accuracy_reward_staging": 0.4140625,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.8671875,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 148.3125,
"epoch": 151.0,
"grad_norm": 86.33754338379147,
"kl": 4.666015625,
"learning_rate": 3.4394097100949286e-06,
"loss": 0.7356,
"reward": 6.453125,
"reward_std": 3.2461503744125366,
"rewards/accuracy_reward_staging": 0.484375,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.796875,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 124.109375,
"epoch": 151.5,
"grad_norm": 10.864856379472597,
"kl": 2.119140625,
"learning_rate": 3.3737995178426276e-06,
"loss": 0.6197,
"reward": 10.125,
"reward_std": 4.969914525747299,
"rewards/accuracy_reward_staging": 0.84375,
"rewards/format_reward": 0.828125,
"rewards/format_reward_staging": 0.859375,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 126.375,
"epoch": 152.0,
"grad_norm": 14.28207912655626,
"kl": 6.5078125,
"learning_rate": 3.308693936411421e-06,
"loss": 0.7237,
"reward": 1.8125,
"reward_std": 1.1962126940488815,
"rewards/accuracy_reward_staging": 0.015625,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.8125,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 70.15625,
"epoch": 152.5,
"grad_norm": 129.83316187917453,
"kl": 5.720703125,
"learning_rate": 3.2440979238433977e-06,
"loss": 0.2064,
"reward": 4.328125,
"reward_std": 0.47754141688346863,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.921875,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 168.125,
"epoch": 153.0,
"grad_norm": 14.37299684325007,
"kl": 3.173828125,
"learning_rate": 3.1800163993750166e-06,
"loss": 0.4395,
"reward": 8.953125,
"reward_std": 4.095484673976898,
"rewards/accuracy_reward_staging": 0.71875,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.890625,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 167.015625,
"epoch": 153.5,
"grad_norm": 6089.787677039342,
"kl": 134.119140625,
"learning_rate": 3.116454243062459e-06,
"loss": 6.8109,
"reward": 4.640625,
"reward_std": 2.973371237516403,
"rewards/accuracy_reward_staging": 0.296875,
"rewards/format_reward": 0.828125,
"rewards/format_reward_staging": 0.84375,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 100.0,
"epoch": 154.0,
"grad_norm": 18.82555217372982,
"kl": 1.556640625,
"learning_rate": 3.0534162954100264e-06,
"loss": 0.7008,
"reward": 8.09375,
"reward_std": 2.37243390083313,
"rewards/accuracy_reward_staging": 0.625,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.921875,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 145.140625,
"epoch": 154.5,
"grad_norm": 65.27226167321199,
"kl": 8.71484375,
"learning_rate": 2.990907357001491e-06,
"loss": 0.7082,
"reward": 7.609375,
"reward_std": 2.803087517619133,
"rewards/accuracy_reward_staging": 0.59375,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.828125,
"step": 309
},
{
"epoch": 155.0,
"grad_norm": 11.996878111801491,
"learning_rate": 2.9289321881345257e-06,
"loss": 0.412,
"step": 310
},
{
"epoch": 155.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 143.1015625,
"eval_kl": 1.8232421875,
"eval_loss": 1.0165082216262817,
"eval_reward": 6.234375,
"eval_reward_std": 2.7126059383153915,
"eval_rewards/accuracy_reward_staging": 0.4453125,
"eval_rewards/format_reward": 0.890625,
"eval_rewards/format_reward_staging": 0.890625,
"eval_runtime": 43.8596,
"eval_samples_per_second": 0.182,
"eval_steps_per_second": 0.023,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 116.7578125,
"epoch": 155.5,
"grad_norm": 20.569051772890276,
"kl": 1.759765625,
"learning_rate": 2.867495508458186e-06,
"loss": 0.2798,
"reward": 4.6484375,
"reward_std": 1.7857269644737244,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.9140625,
"rewards/format_reward_staging": 0.921875,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 119.875,
"epoch": 156.0,
"grad_norm": 66.09995163248371,
"kl": 2.607421875,
"learning_rate": 2.8066019966134907e-06,
"loss": 1.0753,
"reward": 8.84375,
"reward_std": 4.13113260269165,
"rewards/accuracy_reward_staging": 0.71875,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.84375,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 102.953125,
"epoch": 156.5,
"grad_norm": 8.187280314820471,
"kl": 3.30859375,
"learning_rate": 2.746256289877126e-06,
"loss": 0.3809,
"reward": 9.421875,
"reward_std": 3.6460390239953995,
"rewards/accuracy_reward_staging": 0.765625,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.890625,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 131.0625,
"epoch": 157.0,
"grad_norm": 5.144738650630215,
"kl": 2.107421875,
"learning_rate": 2.6864629838082957e-06,
"loss": 0.6558,
"reward": 4.109375,
"reward_std": 1.1705190539360046,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.890625,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 116.03125,
"epoch": 157.5,
"grad_norm": 25.762327223945867,
"kl": 3.041015625,
"learning_rate": 2.6272266318987606e-06,
"loss": 0.7186,
"reward": 8.234375,
"reward_std": 2.516293704509735,
"rewards/accuracy_reward_staging": 0.640625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.921875,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 106.53125,
"epoch": 158.0,
"grad_norm": 13.742094957162939,
"kl": 2.0859375,
"learning_rate": 2.5685517452260566e-06,
"loss": 0.6218,
"reward": 5.78125,
"reward_std": 2.4056650549173355,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.859375,
"rewards/format_reward_staging": 0.859375,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 126.1875,
"epoch": 158.5,
"grad_norm": 16.04349339503592,
"kl": 2.7265625,
"learning_rate": 2.5104427921099783e-06,
"loss": 0.3688,
"reward": 5.984375,
"reward_std": 2.61943382024765,
"rewards/accuracy_reward_staging": 0.421875,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.890625,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 99.15625,
"epoch": 159.0,
"grad_norm": 9.050419274649617,
"kl": 4.8828125,
"learning_rate": 2.45290419777228e-06,
"loss": 0.2878,
"reward": 6.5625,
"reward_std": 3.38299697637558,
"rewards/accuracy_reward_staging": 0.484375,
"rewards/format_reward": 0.859375,
"rewards/format_reward_staging": 0.859375,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 169.609375,
"epoch": 159.5,
"grad_norm": 29.02785370667893,
"kl": 4.904296875,
"learning_rate": 2.395940343999691e-06,
"loss": 0.5377,
"reward": 5.65625,
"reward_std": 2.224781885743141,
"rewards/accuracy_reward_staging": 0.40625,
"rewards/format_reward": 0.796875,
"rewards/format_reward_staging": 0.796875,
"step": 319
},
{
"epoch": 160.0,
"grad_norm": 147.1237773705263,
"learning_rate": 2.339555568810221e-06,
"loss": 0.9691,
"step": 320
},
{
"epoch": 160.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 167.6640625,
"eval_kl": 1.9326171875,
"eval_loss": 0.7716435790061951,
"eval_reward": 6.6484375,
"eval_reward_std": 2.216530680656433,
"eval_rewards/accuracy_reward_staging": 0.4921875,
"eval_rewards/format_reward": 0.84375,
"eval_rewards/format_reward_staging": 0.8828125,
"eval_runtime": 31.1003,
"eval_samples_per_second": 0.257,
"eval_steps_per_second": 0.032,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 117.1796875,
"epoch": 160.5,
"grad_norm": 123.65805346609463,
"kl": 7.9814453125,
"learning_rate": 2.2837541661228024e-06,
"loss": 0.9258,
"reward": 5.8671875,
"reward_std": 1.9323227554559708,
"rewards/accuracy_reward_staging": 0.4140625,
"rewards/format_reward": 0.8359375,
"rewards/format_reward_staging": 0.890625,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 189.71875,
"epoch": 161.0,
"grad_norm": 4.10729901700796,
"kl": 2.5625,
"learning_rate": 2.2285403854302912e-06,
"loss": 0.5784,
"reward": 7.734375,
"reward_std": 5.214049249887466,
"rewards/accuracy_reward_staging": 0.625,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.75,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 218.84375,
"epoch": 161.5,
"grad_norm": 26.92757845878815,
"kl": 2.748046875,
"learning_rate": 2.173918431475861e-06,
"loss": 0.8819,
"reward": 9.484375,
"reward_std": 5.199484676122665,
"rewards/accuracy_reward_staging": 0.796875,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.765625,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 117.75,
"epoch": 162.0,
"grad_norm": 13.57050998781947,
"kl": 1.6328125,
"learning_rate": 2.119892463932781e-06,
"loss": 0.6371,
"reward": 1.671875,
"reward_std": 0.47480545938014984,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.859375,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 157.125,
"epoch": 162.5,
"grad_norm": 4.089653024083605,
"kl": 2.189453125,
"learning_rate": 2.0664665970876496e-06,
"loss": 0.1603,
"reward": 3.640625,
"reward_std": 3.069836288690567,
"rewards/accuracy_reward_staging": 0.203125,
"rewards/format_reward": 0.828125,
"rewards/format_reward_staging": 0.78125,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 224.4375,
"epoch": 163.0,
"grad_norm": 17.786059949816178,
"kl": 5.447265625,
"learning_rate": 2.013644899527074e-06,
"loss": 1.1477,
"reward": 7.015625,
"reward_std": 3.553600549697876,
"rewards/accuracy_reward_staging": 0.546875,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.8125,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 189.75,
"epoch": 163.5,
"grad_norm": 9.621427088420068,
"kl": 9.373046875,
"learning_rate": 1.961431393827827e-06,
"loss": 0.7149,
"reward": 4.359375,
"reward_std": 3.1985532343387604,
"rewards/accuracy_reward_staging": 0.296875,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.703125,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 229.59375,
"epoch": 164.0,
"grad_norm": 76.47407109945985,
"kl": 3.7265625,
"learning_rate": 1.9098300562505266e-06,
"loss": 0.6987,
"reward": 3.78125,
"reward_std": 3.0820604413747787,
"rewards/accuracy_reward_staging": 0.25,
"rewards/format_reward": 0.640625,
"rewards/format_reward_staging": 0.640625,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 230.1875,
"epoch": 164.5,
"grad_norm": 6.994840256019957,
"kl": 2.673828125,
"learning_rate": 1.858844816436809e-06,
"loss": 0.3282,
"reward": 5.0625,
"reward_std": 3.695622056722641,
"rewards/accuracy_reward_staging": 0.359375,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.75,
"step": 329
},
{
"epoch": 165.0,
"grad_norm": 9.165691965744147,
"learning_rate": 1.808479557110081e-06,
"loss": 0.2334,
"step": 330
},
{
"epoch": 165.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 101.296875,
"eval_kl": 3.767578125,
"eval_loss": 0.18460440635681152,
"eval_reward": 5.234375,
"eval_reward_std": 2.932789586484432,
"eval_rewards/accuracy_reward_staging": 0.3671875,
"eval_rewards/format_reward": 0.7734375,
"eval_rewards/format_reward_staging": 0.7890625,
"eval_runtime": 19.4506,
"eval_samples_per_second": 0.411,
"eval_steps_per_second": 0.051,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 120.9609375,
"epoch": 165.5,
"grad_norm": 7.198852766772789,
"kl": 3.2666015625,
"learning_rate": 1.7587381137798432e-06,
"loss": 0.0493,
"reward": 4.8671875,
"reward_std": 1.5325812175869942,
"rewards/accuracy_reward_staging": 0.3203125,
"rewards/format_reward": 0.828125,
"rewards/format_reward_staging": 0.8359375,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 247.46875,
"epoch": 166.0,
"grad_norm": 8.940874552922875,
"kl": 7.59765625,
"learning_rate": 1.709624274449584e-06,
"loss": 0.2349,
"reward": 6.109375,
"reward_std": 4.367333948612213,
"rewards/accuracy_reward_staging": 0.5,
"rewards/format_reward": 0.546875,
"rewards/format_reward_staging": 0.5625,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 128.6875,
"epoch": 166.5,
"grad_norm": 29.279440961120276,
"kl": 4.328125,
"learning_rate": 1.6611417793283192e-06,
"loss": 1.111,
"reward": 7.34375,
"reward_std": 3.199866473674774,
"rewards/accuracy_reward_staging": 0.5625,
"rewards/format_reward": 0.828125,
"rewards/format_reward_staging": 0.890625,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 164.21875,
"epoch": 167.0,
"grad_norm": 7.572100523744779,
"kl": 3.65625,
"learning_rate": 1.6132943205457607e-06,
"loss": 0.2248,
"reward": 2.0625,
"reward_std": 2.6606018245220184,
"rewards/accuracy_reward_staging": 0.09375,
"rewards/format_reward": 0.546875,
"rewards/format_reward_staging": 0.578125,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 117.0,
"epoch": 167.5,
"grad_norm": 38.94662493791989,
"kl": 2.041015625,
"learning_rate": 1.566085541871145e-06,
"loss": 0.9721,
"reward": 2.0,
"reward_std": 1.3283163905143738,
"rewards/accuracy_reward_staging": 0.03125,
"rewards/format_reward": 0.828125,
"rewards/format_reward_staging": 0.859375,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 257.84375,
"epoch": 168.0,
"grad_norm": 12.030868864292964,
"kl": 2.9765625,
"learning_rate": 1.5195190384357405e-06,
"loss": 0.347,
"reward": 7.15625,
"reward_std": 5.804089158773422,
"rewards/accuracy_reward_staging": 0.609375,
"rewards/format_reward": 0.546875,
"rewards/format_reward_staging": 0.515625,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 149.484375,
"epoch": 168.5,
"grad_norm": 4.888452594544771,
"kl": 2.025390625,
"learning_rate": 1.4735983564590784e-06,
"loss": 0.1412,
"reward": 5.875,
"reward_std": 1.471491515636444,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.765625,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 124.65625,
"epoch": 169.0,
"grad_norm": 14.198275343939475,
"kl": 6.076171875,
"learning_rate": 1.4283269929788779e-06,
"loss": 0.1445,
"reward": 5.4375,
"reward_std": 3.588435083627701,
"rewards/accuracy_reward_staging": 0.390625,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.78125,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 138.421875,
"epoch": 169.5,
"grad_norm": 58.80975012160593,
"kl": 4.6015625,
"learning_rate": 1.3837083955847418e-06,
"loss": 0.4146,
"reward": 4.328125,
"reward_std": 2.9866636991500854,
"rewards/accuracy_reward_staging": 0.28125,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.765625,
"step": 339
},
{
"epoch": 170.0,
"grad_norm": 12.936108165176186,
"learning_rate": 1.339745962155613e-06,
"loss": 0.4075,
"step": 340
},
{
"epoch": 170.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 158.3125,
"eval_kl": 3.1328125,
"eval_loss": 0.5399841070175171,
"eval_reward": 4.9921875,
"eval_reward_std": 3.382713630795479,
"eval_rewards/accuracy_reward_staging": 0.3515625,
"eval_rewards/format_reward": 0.734375,
"eval_rewards/format_reward_staging": 0.7421875,
"eval_runtime": 32.245,
"eval_samples_per_second": 0.248,
"eval_steps_per_second": 0.031,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 127.2109375,
"epoch": 170.5,
"grad_norm": 16.397843407801986,
"kl": 5.271484375,
"learning_rate": 1.2964430406010032e-06,
"loss": 0.4235,
"reward": 5.1875,
"reward_std": 2.5617306530475616,
"rewards/accuracy_reward_staging": 0.375,
"rewards/format_reward": 0.7109375,
"rewards/format_reward_staging": 0.7265625,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 181.03125,
"epoch": 171.0,
"grad_norm": 10.177450513861592,
"kl": 6.2265625,
"learning_rate": 1.2538029286060428e-06,
"loss": 0.2828,
"reward": 4.0,
"reward_std": 3.566714286804199,
"rewards/accuracy_reward_staging": 0.265625,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.6875,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 77.390625,
"epoch": 171.5,
"grad_norm": 5.299938896991584,
"kl": 2.30859375,
"learning_rate": 1.2118288733803474e-06,
"loss": 0.1981,
"reward": 1.828125,
"reward_std": 0.4453761428594589,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.921875,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 233.1875,
"epoch": 172.0,
"grad_norm": 9.72856878961463,
"kl": 2.810546875,
"learning_rate": 1.1705240714107301e-06,
"loss": 0.3691,
"reward": 7.234375,
"reward_std": 4.719546392560005,
"rewards/accuracy_reward_staging": 0.609375,
"rewards/format_reward": 0.5625,
"rewards/format_reward_staging": 0.578125,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 156.34375,
"epoch": 172.5,
"grad_norm": 13.674261851507499,
"kl": 5.080078125,
"learning_rate": 1.129891668217783e-06,
"loss": 0.3796,
"reward": 1.515625,
"reward_std": 0.3468210846185684,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.765625,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 125.0625,
"epoch": 173.0,
"grad_norm": 17.14371369730622,
"kl": 3.12109375,
"learning_rate": 1.0899347581163222e-06,
"loss": 0.491,
"reward": 8.140625,
"reward_std": 1.9854381084442139,
"rewards/accuracy_reward_staging": 0.671875,
"rewards/format_reward": 0.703125,
"rewards/format_reward_staging": 0.71875,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 193.859375,
"epoch": 173.5,
"grad_norm": 12.20716781608,
"kl": 3.224609375,
"learning_rate": 1.0506563839797501e-06,
"loss": 0.0906,
"reward": 3.6875,
"reward_std": 3.41329425573349,
"rewards/accuracy_reward_staging": 0.234375,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.671875,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 152.8125,
"epoch": 174.0,
"grad_norm": 5.4458912166318,
"kl": 6.1875,
"learning_rate": 1.012059537008332e-06,
"loss": 0.2197,
"reward": 4.15625,
"reward_std": 2.4828383028507233,
"rewards/accuracy_reward_staging": 0.296875,
"rewards/format_reward": 0.546875,
"rewards/format_reward_staging": 0.640625,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 123.328125,
"epoch": 174.5,
"grad_norm": 11.502073299068844,
"kl": 5.08203125,
"learning_rate": 9.74147156501396e-07,
"loss": 0.233,
"reward": 4.921875,
"reward_std": 2.9725054800510406,
"rewards/accuracy_reward_staging": 0.359375,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.671875,
"step": 349
},
{
"epoch": 175.0,
"grad_norm": 41.38843600405391,
"learning_rate": 9.369221296335007e-07,
"loss": 0.3717,
"step": 350
},
{
"epoch": 175.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 150.5703125,
"eval_kl": 5.74609375,
"eval_loss": 0.37665992975234985,
"eval_reward": 3.9921875,
"eval_reward_std": 2.3707948103547096,
"eval_rewards/accuracy_reward_staging": 0.265625,
"eval_rewards/format_reward": 0.671875,
"eval_rewards/format_reward_staging": 0.6640625,
"eval_runtime": 29.995,
"eval_samples_per_second": 0.267,
"eval_steps_per_second": 0.033,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 176.296875,
"epoch": 175.5,
"grad_norm": 14.284328609401078,
"kl": 2.892578125,
"learning_rate": 9.00387291234569e-07,
"loss": 0.3948,
"reward": 5.5390625,
"reward_std": 2.7443336844444275,
"rewards/accuracy_reward_staging": 0.4140625,
"rewards/format_reward": 0.6953125,
"rewards/format_reward_staging": 0.703125,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 184.53125,
"epoch": 176.0,
"grad_norm": 59.21817249281153,
"kl": 1.8359375,
"learning_rate": 8.645454235739903e-07,
"loss": 0.94,
"reward": 1.421875,
"reward_std": 0.3859764039516449,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.734375,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 225.109375,
"epoch": 176.5,
"grad_norm": 5.721722376999943,
"kl": 3.16015625,
"learning_rate": 8.293992561487596e-07,
"loss": 0.187,
"reward": 5.75,
"reward_std": 1.985233724117279,
"rewards/accuracy_reward_staging": 0.46875,
"rewards/format_reward": 0.515625,
"rewards/format_reward_staging": 0.546875,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 95.09375,
"epoch": 177.0,
"grad_norm": 11.943217011119804,
"kl": 1.8671875,
"learning_rate": 7.949514654755963e-07,
"loss": 0.5445,
"reward": 3.546875,
"reward_std": 2.243361175060272,
"rewards/accuracy_reward_staging": 0.171875,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.921875,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 159.359375,
"epoch": 177.5,
"grad_norm": 7.480612147716681,
"kl": 8.45703125,
"learning_rate": 7.612046748871327e-07,
"loss": 0.3634,
"reward": 1.421875,
"reward_std": 1.0414101481437683,
"rewards/accuracy_reward_staging": 0.015625,
"rewards/format_reward": 0.625,
"rewards/format_reward_staging": 0.640625,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 208.21875,
"epoch": 178.0,
"grad_norm": 18.429391483064126,
"kl": 2.62109375,
"learning_rate": 7.281614543321269e-07,
"loss": 0.5711,
"reward": 7.0625,
"reward_std": 3.4084277749061584,
"rewards/accuracy_reward_staging": 0.578125,
"rewards/format_reward": 0.625,
"rewards/format_reward_staging": 0.65625,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 252.484375,
"epoch": 178.5,
"grad_norm": 14.537737450397852,
"kl": 4.646484375,
"learning_rate": 6.958243201797554e-07,
"loss": 0.1274,
"reward": 5.28125,
"reward_std": 3.292633891105652,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.453125,
"rewards/format_reward_staging": 0.453125,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 92.875,
"epoch": 179.0,
"grad_norm": 37.718281676975764,
"kl": 7.54296875,
"learning_rate": 6.641957350279838e-07,
"loss": 0.2581,
"reward": 1.765625,
"reward_std": 0.5727441757917404,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.890625,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 151.78125,
"epoch": 179.5,
"grad_norm": 6.103859851208992,
"kl": 2.55078125,
"learning_rate": 6.332781075160244e-07,
"loss": 0.1261,
"reward": 3.34375,
"reward_std": 3.3706541061401367,
"rewards/accuracy_reward_staging": 0.1875,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.75,
"step": 359
},
{
"epoch": 180.0,
"grad_norm": 16.044853615424703,
"learning_rate": 6.030737921409169e-07,
"loss": 0.4139,
"step": 360
},
{
"epoch": 180.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 120.9140625,
"eval_kl": 2.34375,
"eval_loss": 0.017587810754776,
"eval_reward": 3.7421875,
"eval_reward_std": 1.6082397252321243,
"eval_rewards/accuracy_reward_staging": 0.234375,
"eval_rewards/format_reward": 0.6796875,
"eval_rewards/format_reward_staging": 0.71875,
"eval_runtime": 19.8514,
"eval_samples_per_second": 0.403,
"eval_steps_per_second": 0.05,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 156.7421875,
"epoch": 180.5,
"grad_norm": 75.37216219372046,
"kl": 4.494140625,
"learning_rate": 5.735850890782158e-07,
"loss": 0.1449,
"reward": 6.59375,
"reward_std": 3.172459162771702,
"rewards/accuracy_reward_staging": 0.4921875,
"rewards/format_reward": 0.828125,
"rewards/format_reward_staging": 0.84375,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 247.46875,
"epoch": 181.0,
"grad_norm": 12.075035951064045,
"kl": 3.07421875,
"learning_rate": 5.448142440068316e-07,
"loss": 0.1604,
"reward": 1.09375,
"reward_std": 0.9126968681812286,
"rewards/accuracy_reward_staging": 0.015625,
"rewards/format_reward": 0.46875,
"rewards/format_reward_staging": 0.46875,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 155.796875,
"epoch": 181.5,
"grad_norm": 64.11377087450238,
"kl": 3.0390625,
"learning_rate": 5.167634479380068e-07,
"loss": 0.8806,
"reward": 1.984375,
"reward_std": 1.6587499380111694,
"rewards/accuracy_reward_staging": 0.078125,
"rewards/format_reward": 0.59375,
"rewards/format_reward_staging": 0.609375,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 175.96875,
"epoch": 182.0,
"grad_norm": 14.895018127249944,
"kl": 3.53125,
"learning_rate": 4.894348370484648e-07,
"loss": 0.2306,
"reward": 4.734375,
"reward_std": 3.2918047457933426,
"rewards/accuracy_reward_staging": 0.34375,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.640625,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 99.234375,
"epoch": 182.5,
"grad_norm": 7.957954932541399,
"kl": 2.3203125,
"learning_rate": 4.628304925177318e-07,
"loss": 0.044,
"reward": 7.359375,
"reward_std": 3.293235272169113,
"rewards/accuracy_reward_staging": 0.546875,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.953125,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 212.15625,
"epoch": 183.0,
"grad_norm": 13.244592420338774,
"kl": 4.23046875,
"learning_rate": 4.3695244036964567e-07,
"loss": 0.4,
"reward": 0.859375,
"reward_std": 0.5324036777019501,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.421875,
"rewards/format_reward_staging": 0.4375,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 82.6875,
"epoch": 183.5,
"grad_norm": 23.479873436751383,
"kl": 13.568359375,
"learning_rate": 4.118026513180695e-07,
"loss": 0.286,
"reward": 6.6875,
"reward_std": 3.936906337738037,
"rewards/accuracy_reward_staging": 0.484375,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.90625,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 220.9375,
"epoch": 184.0,
"grad_norm": 10.070112240993417,
"kl": 2.416015625,
"learning_rate": 3.8738304061681107e-07,
"loss": 0.2163,
"reward": 1.09375,
"reward_std": 0.8050735592842102,
"rewards/accuracy_reward_staging": 0.015625,
"rewards/format_reward": 0.46875,
"rewards/format_reward_staging": 0.46875,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 88.8125,
"epoch": 184.5,
"grad_norm": 8.136056851141001,
"kl": 2.525390625,
"learning_rate": 3.6369546791377054e-07,
"loss": 0.1381,
"reward": 6.765625,
"reward_std": 4.866852879524231,
"rewards/accuracy_reward_staging": 0.53125,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.734375,
"step": 369
},
{
"epoch": 185.0,
"grad_norm": 17.037859451924284,
"learning_rate": 3.4074173710931804e-07,
"loss": 0.3005,
"step": 370
},
{
"epoch": 185.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 144.296875,
"eval_kl": 3.22265625,
"eval_loss": 0.20356737077236176,
"eval_reward": 3.8203125,
"eval_reward_std": 2.0116966366767883,
"eval_rewards/accuracy_reward_staging": 0.25,
"eval_rewards/format_reward": 0.6484375,
"eval_rewards/format_reward_staging": 0.671875,
"eval_runtime": 30.7935,
"eval_samples_per_second": 0.26,
"eval_steps_per_second": 0.032,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 147.8671875,
"epoch": 185.5,
"grad_norm": 18.635491260401505,
"kl": 4.578125,
"learning_rate": 3.185235962189237e-07,
"loss": 0.3295,
"reward": 3.8046875,
"reward_std": 1.7106563821434975,
"rewards/accuracy_reward_staging": 0.2421875,
"rewards/format_reward": 0.6796875,
"rewards/format_reward_staging": 0.703125,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 140.125,
"epoch": 186.0,
"grad_norm": 14.577568071606951,
"kl": 3.724609375,
"learning_rate": 2.970427372400353e-07,
"loss": 0.0855,
"reward": 1.4375,
"reward_std": 0.6442917585372925,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.703125,
"rewards/format_reward_staging": 0.734375,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 184.890625,
"epoch": 186.5,
"grad_norm": 58.14646429098023,
"kl": 1.859375,
"learning_rate": 2.7630079602323447e-07,
"loss": 0.4805,
"reward": 7.09375,
"reward_std": 3.6945143938064575,
"rewards/accuracy_reward_staging": 0.5625,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.75,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 97.53125,
"epoch": 187.0,
"grad_norm": 17.627898991545116,
"kl": 3.81640625,
"learning_rate": 2.5629935214764866e-07,
"loss": 0.5385,
"reward": 1.5625,
"reward_std": 0.9268264472484589,
"rewards/accuracy_reward_staging": 0.015625,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.71875,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 183.8125,
"epoch": 187.5,
"grad_norm": 9.59043190634086,
"kl": 3.857421875,
"learning_rate": 2.370399288006664e-07,
"loss": 0.1804,
"reward": 2.703125,
"reward_std": 3.5622373819351196,
"rewards/accuracy_reward_staging": 0.171875,
"rewards/format_reward": 0.484375,
"rewards/format_reward_staging": 0.5,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 113.0625,
"epoch": 188.0,
"grad_norm": 16.299906961139566,
"kl": 2.330078125,
"learning_rate": 2.1852399266194312e-07,
"loss": 0.4884,
"reward": 6.109375,
"reward_std": 1.688461884856224,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.859375,
"rewards/format_reward_staging": 0.875,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 182.75,
"epoch": 188.5,
"grad_norm": 8.539805865049278,
"kl": 8.970703125,
"learning_rate": 2.0075295379170413e-07,
"loss": 0.3236,
"reward": 1.71875,
"reward_std": 2.325968086719513,
"rewards/accuracy_reward_staging": 0.078125,
"rewards/format_reward": 0.46875,
"rewards/format_reward_staging": 0.46875,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 73.34375,
"epoch": 189.0,
"grad_norm": 10.441791159370938,
"kl": 2.33203125,
"learning_rate": 1.8372816552336025e-07,
"loss": 0.1111,
"reward": 7.28125,
"reward_std": 3.7130661606788635,
"rewards/accuracy_reward_staging": 0.546875,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.921875,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 140.890625,
"epoch": 189.5,
"grad_norm": 15.428094188720301,
"kl": 4.2265625,
"learning_rate": 1.6745092436045495e-07,
"loss": 0.0711,
"reward": 1.875,
"reward_std": 1.2960638105869293,
"rewards/accuracy_reward_staging": 0.046875,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.71875,
"step": 379
},
{
"epoch": 190.0,
"grad_norm": 7.7297837980318445,
"learning_rate": 1.519224698779198e-07,
"loss": 0.2035,
"step": 380
},
{
"epoch": 190.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 101.8515625,
"eval_kl": 16.283203125,
"eval_loss": 0.3184351921081543,
"eval_reward": 3.796875,
"eval_reward_std": 3.2891100347042084,
"eval_rewards/accuracy_reward_staging": 0.2421875,
"eval_rewards/format_reward": 0.6875,
"eval_rewards/format_reward_staging": 0.6875,
"eval_runtime": 28.2419,
"eval_samples_per_second": 0.283,
"eval_steps_per_second": 0.035,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 93.7890625,
"epoch": 190.5,
"grad_norm": 6.692292880766585,
"kl": 3.47265625,
"learning_rate": 1.3714398462768563e-07,
"loss": 0.0341,
"reward": 3.6484375,
"reward_std": 2.5661590471863747,
"rewards/accuracy_reward_staging": 0.2265625,
"rewards/format_reward": 0.6953125,
"rewards/format_reward_staging": 0.6875,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 126.25,
"epoch": 191.0,
"grad_norm": 12.116075689587404,
"kl": 6.255859375,
"learning_rate": 1.231165940486234e-07,
"loss": 0.0657,
"reward": 5.890625,
"reward_std": 2.3291388154029846,
"rewards/accuracy_reward_staging": 0.453125,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.6875,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 180.640625,
"epoch": 191.5,
"grad_norm": 12.664375356889414,
"kl": 5.28515625,
"learning_rate": 1.0984136638083176e-07,
"loss": 0.3136,
"reward": 2.53125,
"reward_std": 3.584800824522972,
"rewards/accuracy_reward_staging": 0.15625,
"rewards/format_reward": 0.484375,
"rewards/format_reward_staging": 0.484375,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 64.40625,
"epoch": 192.0,
"grad_norm": 4.339973094865359,
"kl": 1.96484375,
"learning_rate": 9.731931258429638e-08,
"loss": -0.0634,
"reward": 5.75,
"reward_std": 1.7209889590740204,
"rewards/accuracy_reward_staging": 0.390625,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 118.625,
"epoch": 192.5,
"grad_norm": 11.3243870187843,
"kl": 2.5078125,
"learning_rate": 8.555138626189619e-08,
"loss": 0.3026,
"reward": 5.828125,
"reward_std": 3.4805214405059814,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.734375,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 160.5,
"epoch": 193.0,
"grad_norm": 13.116416295027339,
"kl": 7.072265625,
"learning_rate": 7.453848358678018e-08,
"loss": 0.4106,
"reward": 1.515625,
"reward_std": 0.8902590423822403,
"rewards/accuracy_reward_staging": 0.015625,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.703125,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 70.296875,
"epoch": 193.5,
"grad_norm": 25.501237737582994,
"kl": 4.64453125,
"learning_rate": 6.428144323412544e-08,
"loss": 0.0485,
"reward": 1.6875,
"reward_std": 0.6990881264209747,
"rewards/accuracy_reward_staging": 0.0,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.84375,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 134.8125,
"epoch": 194.0,
"grad_norm": 11.249206018043907,
"kl": 4.14453125,
"learning_rate": 5.4781046317267103e-08,
"loss": 0.1459,
"reward": 6.109375,
"reward_std": 3.1836692690849304,
"rewards/accuracy_reward_staging": 0.515625,
"rewards/format_reward": 0.46875,
"rewards/format_reward_staging": 0.484375,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 98.65625,
"epoch": 194.5,
"grad_norm": 29.24345766629281,
"kl": 15.025390625,
"learning_rate": 4.603801632821148e-08,
"loss": 0.1154,
"reward": 1.859375,
"reward_std": 2.3215357810258865,
"rewards/accuracy_reward_staging": 0.046875,
"rewards/format_reward": 0.703125,
"rewards/format_reward_staging": 0.6875,
"step": 389
},
{
"epoch": 195.0,
"grad_norm": 9.771374335396073,
"learning_rate": 3.805301908254455e-08,
"loss": 0.0831,
"step": 390
},
{
"epoch": 195.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 118.71875,
"eval_kl": 5.439453125,
"eval_loss": 0.20848847925662994,
"eval_reward": 3.8515625,
"eval_reward_std": 2.876259058713913,
"eval_rewards/accuracy_reward_staging": 0.25,
"eval_rewards/format_reward": 0.6640625,
"eval_rewards/format_reward_staging": 0.6875,
"eval_runtime": 28.0016,
"eval_samples_per_second": 0.286,
"eval_steps_per_second": 0.036,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 131.8515625,
"epoch": 195.5,
"grad_norm": 18.170540683038464,
"kl": 4.927734375,
"learning_rate": 3.082666266872036e-08,
"loss": 0.3564,
"reward": 4.40625,
"reward_std": 2.274537533521652,
"rewards/accuracy_reward_staging": 0.3046875,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.6875,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 89.9375,
"epoch": 196.0,
"grad_norm": 37.59210149762812,
"kl": 2.451171875,
"learning_rate": 2.4359497401758026e-08,
"loss": 0.3768,
"reward": 5.59375,
"reward_std": 2.1195763051509857,
"rewards/accuracy_reward_staging": 0.421875,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.703125,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 249.640625,
"epoch": 196.5,
"grad_norm": 10.008461927031737,
"kl": 10.494140625,
"learning_rate": 1.86520157813308e-08,
"loss": 0.147,
"reward": 2.078125,
"reward_std": 1.580733835697174,
"rewards/accuracy_reward_staging": 0.109375,
"rewards/format_reward": 0.484375,
"rewards/format_reward_staging": 0.5,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 70.09375,
"epoch": 197.0,
"grad_norm": 45.873680058130404,
"kl": 2.017578125,
"learning_rate": 1.370465245426167e-08,
"loss": 0.7853,
"reward": 5.8125,
"reward_std": 1.908312439918518,
"rewards/accuracy_reward_staging": 0.390625,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.953125,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 75.359375,
"epoch": 197.5,
"grad_norm": 8.377325192503205,
"kl": 3.708984375,
"learning_rate": 9.517784181422018e-09,
"loss": 0.191,
"reward": 2.875,
"reward_std": 2.159477174282074,
"rewards/accuracy_reward_staging": 0.109375,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.890625,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 283.65625,
"epoch": 198.0,
"grad_norm": 6.244882431212038,
"kl": 2.7890625,
"learning_rate": 6.091729809042379e-09,
"loss": 0.1084,
"reward": 5.40625,
"reward_std": 2.89695280790329,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.515625,
"rewards/format_reward_staging": 0.515625,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 106.40625,
"epoch": 198.5,
"grad_norm": 38.143536821255225,
"kl": 2.380859375,
"learning_rate": 3.4267502444274013e-09,
"loss": 0.6792,
"reward": 6.21875,
"reward_std": 3.516386479139328,
"rewards/accuracy_reward_staging": 0.4375,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 156.4375,
"epoch": 199.0,
"grad_norm": 14.86370731226592,
"kl": 3.87890625,
"learning_rate": 1.5230484360873043e-09,
"loss": 0.1544,
"reward": 1.0625,
"reward_std": 0.9951936304569244,
"rewards/accuracy_reward_staging": 0.015625,
"rewards/format_reward": 0.4375,
"rewards/format_reward_staging": 0.46875,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 68.96875,
"epoch": 199.5,
"grad_norm": 3.6909839942424423,
"kl": 2.57421875,
"learning_rate": 3.807693582869032e-10,
"loss": -0.0079,
"reward": 3.234375,
"reward_std": 2.6320499926805496,
"rewards/accuracy_reward_staging": 0.140625,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.90625,
"step": 399
},
{
"epoch": 200.0,
"grad_norm": 29.889890508013348,
"learning_rate": 0.0,
"loss": 0.6276,
"step": 400
},
{
"epoch": 200.0,
"eval_clip_ratio": 0.0,
"eval_completion_length": 155.21875,
"eval_kl": 4.8115234375,
"eval_loss": 0.44944292306900024,
"eval_reward": 4.1171875,
"eval_reward_std": 2.682509124279022,
"eval_rewards/accuracy_reward_staging": 0.28125,
"eval_rewards/format_reward": 0.6328125,
"eval_rewards/format_reward_staging": 0.671875,
"eval_runtime": 36.2396,
"eval_samples_per_second": 0.221,
"eval_steps_per_second": 0.028,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 306.3125,
"epoch": 200.0,
"kl": 5.7890625,
"reward": 3.8125,
"reward_std": 2.403294324874878,
"rewards/accuracy_reward_staging": 0.296875,
"rewards/format_reward": 0.421875,
"rewards/format_reward_staging": 0.421875,
"step": 400,
"total_flos": 0.0,
"train_loss": 0.7732266664505005,
"train_runtime": 7747.2937,
"train_samples_per_second": 0.207,
"train_steps_per_second": 0.052
}
],
"logging_steps": 1,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 200,
"save_steps": 40,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}