| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 200.0, |
| "eval_steps": 10, |
| "global_step": 400, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 459.375, |
| "epoch": 0.5, |
| "grad_norm": 2.876323361721518, |
| "kl": 0.0, |
| "learning_rate": 5.000000000000001e-07, |
| "loss": 0.3004, |
| "reward": 0.65625, |
| "reward_std": 0.8805903792381287, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.28125, |
| "rewards/format_reward_staging": 0.375, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 472.78125, |
| "epoch": 1.0, |
| "grad_norm": 2.1678805572948407, |
| "kl": 0.0, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.2723, |
| "reward": 0.859375, |
| "reward_std": 0.8921410292387009, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.34375, |
| "rewards/format_reward_staging": 0.515625, |
| "step": 2 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 402.53125, |
| "epoch": 1.5, |
| "grad_norm": 2.685905075565771, |
| "kl": 0.000606536865234375, |
| "learning_rate": 1.5e-06, |
| "loss": 0.2138, |
| "reward": 0.890625, |
| "reward_std": 0.9797716289758682, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.421875, |
| "rewards/format_reward_staging": 0.46875, |
| "step": 3 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 435.40625, |
| "epoch": 2.0, |
| "grad_norm": 2.4665649457110215, |
| "kl": 0.0006070137023925781, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.2588, |
| "reward": 0.84375, |
| "reward_std": 1.4599270820617676, |
| "rewards/accuracy_reward_staging": 0.015625, |
| "rewards/format_reward": 0.296875, |
| "rewards/format_reward_staging": 0.390625, |
| "step": 4 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 441.5, |
| "epoch": 2.5, |
| "grad_norm": 2.7610545552229273, |
| "kl": 0.0015697479248046875, |
| "learning_rate": 2.5e-06, |
| "loss": 0.3144, |
| "reward": 0.875, |
| "reward_std": 0.8861797749996185, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.375, |
| "rewards/format_reward_staging": 0.5, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 538.15625, |
| "epoch": 3.0, |
| "grad_norm": 1.9363663555232191, |
| "kl": 0.006195068359375, |
| "learning_rate": 3e-06, |
| "loss": 0.208, |
| "reward": 0.734375, |
| "reward_std": 0.9395850598812103, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.328125, |
| "rewards/format_reward_staging": 0.40625, |
| "step": 6 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 438.53125, |
| "epoch": 3.5, |
| "grad_norm": 40.059369045816226, |
| "kl": 0.00463104248046875, |
| "learning_rate": 3.5e-06, |
| "loss": 0.2105, |
| "reward": 0.75, |
| "reward_std": 0.8607002794742584, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.28125, |
| "rewards/format_reward_staging": 0.46875, |
| "step": 7 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 444.6875, |
| "epoch": 4.0, |
| "grad_norm": 2.042344469155283, |
| "kl": 0.0260009765625, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.2712, |
| "reward": 0.96875, |
| "reward_std": 0.9265350848436356, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.390625, |
| "rewards/format_reward_staging": 0.578125, |
| "step": 8 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 436.828125, |
| "epoch": 4.5, |
| "grad_norm": 1.8209399338820504, |
| "kl": 0.029571533203125, |
| "learning_rate": 4.5e-06, |
| "loss": 0.3164, |
| "reward": 1.0, |
| "reward_std": 0.9064923822879791, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.421875, |
| "rewards/format_reward_staging": 0.578125, |
| "step": 9 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 2.6748764898489186, |
| "learning_rate": 5e-06, |
| "loss": 0.3694, |
| "step": 10 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 323.546875, |
| "eval_kl": 0.1407470703125, |
| "eval_loss": 0.21327295899391174, |
| "eval_reward": 1.625, |
| "eval_reward_std": 0.6777683570981026, |
| "eval_rewards/accuracy_reward_staging": 0.0, |
| "eval_rewards/format_reward": 0.78125, |
| "eval_rewards/format_reward_staging": 0.84375, |
| "eval_runtime": 40.7929, |
| "eval_samples_per_second": 0.196, |
| "eval_steps_per_second": 0.025, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 387.734375, |
| "epoch": 5.5, |
| "grad_norm": 3.5720394192584792, |
| "kl": 0.1397705078125, |
| "learning_rate": 5.500000000000001e-06, |
| "loss": 0.4228, |
| "reward": 1.421875, |
| "reward_std": 1.1599705293774605, |
| "rewards/accuracy_reward_staging": 0.0078125, |
| "rewards/format_reward": 0.6484375, |
| "rewards/format_reward_staging": 0.6953125, |
| "step": 11 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 331.90625, |
| "epoch": 6.0, |
| "grad_norm": 3.665359231516559, |
| "kl": 0.20849609375, |
| "learning_rate": 6e-06, |
| "loss": 0.5381, |
| "reward": 1.640625, |
| "reward_std": 0.5743362456560135, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.734375, |
| "rewards/format_reward_staging": 0.90625, |
| "step": 12 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 303.671875, |
| "epoch": 6.5, |
| "grad_norm": 2.5383946033246154, |
| "kl": 0.239501953125, |
| "learning_rate": 6.5000000000000004e-06, |
| "loss": 0.2558, |
| "reward": 1.921875, |
| "reward_std": 1.0853875279426575, |
| "rewards/accuracy_reward_staging": 0.015625, |
| "rewards/format_reward": 0.859375, |
| "rewards/format_reward_staging": 0.90625, |
| "step": 13 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 253.375, |
| "epoch": 7.0, |
| "grad_norm": 2.9224354012117546, |
| "kl": 0.36865234375, |
| "learning_rate": 7e-06, |
| "loss": 0.1188, |
| "reward": 1.90625, |
| "reward_std": 0.3234764039516449, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.96875, |
| "step": 14 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 229.6875, |
| "epoch": 7.5, |
| "grad_norm": 134.13549246823817, |
| "kl": 2.4775390625, |
| "learning_rate": 7.500000000000001e-06, |
| "loss": 0.25, |
| "reward": 1.90625, |
| "reward_std": 0.29578250646591187, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.96875, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 201.375, |
| "epoch": 8.0, |
| "grad_norm": 3.387910323346065, |
| "kl": 0.7109375, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.0706, |
| "reward": 1.9375, |
| "reward_std": 0.11180339753627777, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 1.0, |
| "step": 16 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 244.984375, |
| "epoch": 8.5, |
| "grad_norm": 4.308168061347317, |
| "kl": 0.810546875, |
| "learning_rate": 8.5e-06, |
| "loss": 0.0776, |
| "reward": 1.984375, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.984375, |
| "rewards/format_reward_staging": 1.0, |
| "step": 17 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 211.59375, |
| "epoch": 9.0, |
| "grad_norm": 2.9369254269642333, |
| "kl": 1.34228515625, |
| "learning_rate": 9e-06, |
| "loss": -0.0485, |
| "reward": 1.921875, |
| "reward_std": 0.11967839300632477, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.984375, |
| "step": 18 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 313.828125, |
| "epoch": 9.5, |
| "grad_norm": 1.4789470328059777, |
| "kl": 0.5146484375, |
| "learning_rate": 9.5e-06, |
| "loss": 0.0547, |
| "reward": 1.96875, |
| "reward_std": 0.08539125323295593, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.984375, |
| "rewards/format_reward_staging": 0.984375, |
| "step": 19 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 80.10476715951108, |
| "learning_rate": 1e-05, |
| "loss": 0.0398, |
| "step": 20 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 278.5546875, |
| "eval_kl": 0.609130859375, |
| "eval_loss": -0.0038001190405339003, |
| "eval_reward": 2.1015625, |
| "eval_reward_std": 0.5469204634428024, |
| "eval_rewards/accuracy_reward_staging": 0.015625, |
| "eval_rewards/format_reward": 0.953125, |
| "eval_rewards/format_reward_staging": 0.9921875, |
| "eval_runtime": 30.1338, |
| "eval_samples_per_second": 0.265, |
| "eval_steps_per_second": 0.033, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 268.6953125, |
| "epoch": 10.5, |
| "grad_norm": 2.0467738114975678, |
| "kl": 4.900390625, |
| "learning_rate": 1.0500000000000001e-05, |
| "loss": 0.0361, |
| "reward": 2.25, |
| "reward_std": 1.1723129898309708, |
| "rewards/accuracy_reward_staging": 0.03125, |
| "rewards/format_reward": 0.9453125, |
| "rewards/format_reward_staging": 0.9921875, |
| "step": 21 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 304.1875, |
| "epoch": 11.0, |
| "grad_norm": 1.8964954924900732, |
| "kl": 0.7880859375, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": -0.1894, |
| "reward": 2.078125, |
| "reward_std": 0.8426409065723419, |
| "rewards/accuracy_reward_staging": 0.015625, |
| "rewards/format_reward": 0.921875, |
| "rewards/format_reward_staging": 1.0, |
| "step": 22 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 266.6875, |
| "epoch": 11.5, |
| "grad_norm": 1.6929978014913423, |
| "kl": 0.779296875, |
| "learning_rate": 1.15e-05, |
| "loss": -0.0221, |
| "reward": 1.90625, |
| "reward_std": 0.24866947531700134, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 1.0, |
| "step": 23 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 259.125, |
| "epoch": 12.0, |
| "grad_norm": 1.7383655787319439, |
| "kl": 0.7666015625, |
| "learning_rate": 1.2e-05, |
| "loss": 0.0302, |
| "reward": 3.0625, |
| "reward_std": 3.1296846866607666, |
| "rewards/accuracy_reward_staging": 0.109375, |
| "rewards/format_reward": 0.96875, |
| "rewards/format_reward_staging": 1.0, |
| "step": 24 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 233.625, |
| "epoch": 12.5, |
| "grad_norm": 4.648191964945557, |
| "kl": 1.6943359375, |
| "learning_rate": 1.25e-05, |
| "loss": -0.0409, |
| "reward": 2.21875, |
| "reward_std": 1.4246117174625397, |
| "rewards/accuracy_reward_staging": 0.03125, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.96875, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 239.625, |
| "epoch": 13.0, |
| "grad_norm": 1.4292059587187729, |
| "kl": 0.57421875, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": -0.0077, |
| "reward": 2.734375, |
| "reward_std": 1.3342310190200806, |
| "rewards/accuracy_reward_staging": 0.078125, |
| "rewards/format_reward": 0.96875, |
| "rewards/format_reward_staging": 0.984375, |
| "step": 26 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 223.390625, |
| "epoch": 13.5, |
| "grad_norm": 2.513864581190583, |
| "kl": 1.228515625, |
| "learning_rate": 1.3500000000000001e-05, |
| "loss": 0.0735, |
| "reward": 2.921875, |
| "reward_std": 2.031271666288376, |
| "rewards/accuracy_reward_staging": 0.109375, |
| "rewards/format_reward": 0.859375, |
| "rewards/format_reward_staging": 0.96875, |
| "step": 27 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 221.65625, |
| "epoch": 14.0, |
| "grad_norm": 1.5115163324009253, |
| "kl": 0.724609375, |
| "learning_rate": 1.4e-05, |
| "loss": 0.0464, |
| "reward": 2.078125, |
| "reward_std": 0.6708659529685974, |
| "rewards/accuracy_reward_staging": 0.015625, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.984375, |
| "step": 28 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 269.03125, |
| "epoch": 14.5, |
| "grad_norm": 1.7257409221598474, |
| "kl": 0.5712890625, |
| "learning_rate": 1.45e-05, |
| "loss": -0.0904, |
| "reward": 3.34375, |
| "reward_std": 2.8355378210544586, |
| "rewards/accuracy_reward_staging": 0.140625, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 1.0, |
| "step": 29 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 2.5117601129661438, |
| "learning_rate": 1.5000000000000002e-05, |
| "loss": -0.0204, |
| "step": 30 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 292.3984375, |
| "eval_kl": 0.724365234375, |
| "eval_loss": 0.037196554243564606, |
| "eval_reward": 3.015625, |
| "eval_reward_std": 2.8644309490919113, |
| "eval_rewards/accuracy_reward_staging": 0.1171875, |
| "eval_rewards/format_reward": 0.8984375, |
| "eval_rewards/format_reward_staging": 0.9453125, |
| "eval_runtime": 35.4144, |
| "eval_samples_per_second": 0.226, |
| "eval_steps_per_second": 0.028, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 258.734375, |
| "epoch": 15.5, |
| "grad_norm": 2.0989197265306423, |
| "kl": 0.6796875, |
| "learning_rate": 1.55e-05, |
| "loss": -0.1012, |
| "reward": 2.5390625, |
| "reward_std": 1.8101423382759094, |
| "rewards/accuracy_reward_staging": 0.0625, |
| "rewards/format_reward": 0.921875, |
| "rewards/format_reward_staging": 0.9921875, |
| "step": 31 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 298.40625, |
| "epoch": 16.0, |
| "grad_norm": 1.1476664560432441, |
| "kl": 0.611328125, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": -0.0672, |
| "reward": 2.28125, |
| "reward_std": 0.93930384516716, |
| "rewards/accuracy_reward_staging": 0.03125, |
| "rewards/format_reward": 0.96875, |
| "rewards/format_reward_staging": 1.0, |
| "step": 32 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 268.296875, |
| "epoch": 16.5, |
| "grad_norm": 19.21670679033428, |
| "kl": 1.50927734375, |
| "learning_rate": 1.65e-05, |
| "loss": 0.0674, |
| "reward": 2.296875, |
| "reward_std": 1.257249653339386, |
| "rewards/accuracy_reward_staging": 0.03125, |
| "rewards/format_reward": 0.984375, |
| "rewards/format_reward_staging": 1.0, |
| "step": 33 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 282.3125, |
| "epoch": 17.0, |
| "grad_norm": 1.4246525591844976, |
| "kl": 0.56787109375, |
| "learning_rate": 1.7e-05, |
| "loss": -0.0194, |
| "reward": 3.8125, |
| "reward_std": 3.774241268634796, |
| "rewards/accuracy_reward_staging": 0.1875, |
| "rewards/format_reward": 0.953125, |
| "rewards/format_reward_staging": 0.984375, |
| "step": 34 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 319.765625, |
| "epoch": 17.5, |
| "grad_norm": 1.9762168820196657, |
| "kl": 0.77197265625, |
| "learning_rate": 1.7500000000000002e-05, |
| "loss": 0.2802, |
| "reward": 3.0, |
| "reward_std": 2.2321222722530365, |
| "rewards/accuracy_reward_staging": 0.109375, |
| "rewards/format_reward": 0.953125, |
| "rewards/format_reward_staging": 0.953125, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 315.3125, |
| "epoch": 18.0, |
| "grad_norm": 1.342401716752108, |
| "kl": 0.7529296875, |
| "learning_rate": 1.8e-05, |
| "loss": 0.0864, |
| "reward": 3.796875, |
| "reward_std": 2.384265750646591, |
| "rewards/accuracy_reward_staging": 0.1875, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.984375, |
| "step": 36 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 180.671875, |
| "epoch": 18.5, |
| "grad_norm": 1.3852885333924034, |
| "kl": 0.7998046875, |
| "learning_rate": 1.8500000000000002e-05, |
| "loss": -0.0281, |
| "reward": 3.375, |
| "reward_std": 2.220172733068466, |
| "rewards/accuracy_reward_staging": 0.140625, |
| "rewards/format_reward": 1.0, |
| "rewards/format_reward_staging": 0.96875, |
| "step": 37 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 153.625, |
| "epoch": 19.0, |
| "grad_norm": 1.5314695103803033, |
| "kl": 0.89453125, |
| "learning_rate": 1.9e-05, |
| "loss": -0.001, |
| "reward": 2.921875, |
| "reward_std": 1.9802924394607544, |
| "rewards/accuracy_reward_staging": 0.09375, |
| "rewards/format_reward": 0.984375, |
| "rewards/format_reward_staging": 1.0, |
| "step": 38 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 187.234375, |
| "epoch": 19.5, |
| "grad_norm": 1.5329342983674943, |
| "kl": 0.796875, |
| "learning_rate": 1.95e-05, |
| "loss": 0.0871, |
| "reward": 2.40625, |
| "reward_std": 1.1022064685821533, |
| "rewards/accuracy_reward_staging": 0.046875, |
| "rewards/format_reward": 0.984375, |
| "rewards/format_reward_staging": 0.953125, |
| "step": 39 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 1.80824958208745, |
| "learning_rate": 2e-05, |
| "loss": 0.0547, |
| "step": 40 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 199.2421875, |
| "eval_kl": 0.71337890625, |
| "eval_loss": 0.02822229452431202, |
| "eval_reward": 3.6640625, |
| "eval_reward_std": 2.5834601297974586, |
| "eval_rewards/accuracy_reward_staging": 0.1953125, |
| "eval_rewards/format_reward": 0.96875, |
| "eval_rewards/format_reward_staging": 0.7421875, |
| "eval_runtime": 28.7471, |
| "eval_samples_per_second": 0.278, |
| "eval_steps_per_second": 0.035, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 253.6796875, |
| "epoch": 20.5, |
| "grad_norm": 2.494613965798382, |
| "kl": 0.66796875, |
| "learning_rate": 1.9999619230641714e-05, |
| "loss": -0.083, |
| "reward": 4.09375, |
| "reward_std": 2.9600732252001762, |
| "rewards/accuracy_reward_staging": 0.2265625, |
| "rewards/format_reward": 0.96875, |
| "rewards/format_reward_staging": 0.859375, |
| "step": 41 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 231.5625, |
| "epoch": 21.0, |
| "grad_norm": 1.5230693811732243, |
| "kl": 1.0322265625, |
| "learning_rate": 1.9998476951563914e-05, |
| "loss": 0.0046, |
| "reward": 3.453125, |
| "reward_std": 3.205719515681267, |
| "rewards/accuracy_reward_staging": 0.1875, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.6875, |
| "step": 42 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 278.203125, |
| "epoch": 21.5, |
| "grad_norm": 1.758870754503535, |
| "kl": 0.6474609375, |
| "learning_rate": 1.9996573249755573e-05, |
| "loss": -0.0531, |
| "reward": 3.65625, |
| "reward_std": 2.5534728318452835, |
| "rewards/accuracy_reward_staging": 0.203125, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.71875, |
| "step": 43 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 275.15625, |
| "epoch": 22.0, |
| "grad_norm": 2.0919325244919147, |
| "kl": 0.9189453125, |
| "learning_rate": 1.999390827019096e-05, |
| "loss": 0.0602, |
| "reward": 3.75, |
| "reward_std": 2.127801224589348, |
| "rewards/accuracy_reward_staging": 0.203125, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.828125, |
| "step": 44 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 328.328125, |
| "epoch": 22.5, |
| "grad_norm": 1.5048949701874923, |
| "kl": 0.705078125, |
| "learning_rate": 1.999048221581858e-05, |
| "loss": -0.0272, |
| "reward": 4.140625, |
| "reward_std": 2.4673196375370026, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.859375, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 430.84375, |
| "epoch": 23.0, |
| "grad_norm": 5.30560128522589, |
| "kl": 0.68798828125, |
| "learning_rate": 1.9986295347545738e-05, |
| "loss": 0.1258, |
| "reward": 4.0, |
| "reward_std": 1.9574655294418335, |
| "rewards/accuracy_reward_staging": 0.21875, |
| "rewards/format_reward": 0.921875, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 46 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 447.90625, |
| "epoch": 23.5, |
| "grad_norm": 1.5599503211998336, |
| "kl": 0.6640625, |
| "learning_rate": 1.998134798421867e-05, |
| "loss": 0.0862, |
| "reward": 3.53125, |
| "reward_std": 2.2529123574495316, |
| "rewards/accuracy_reward_staging": 0.1875, |
| "rewards/format_reward": 0.796875, |
| "rewards/format_reward_staging": 0.859375, |
| "step": 47 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 517.34375, |
| "epoch": 24.0, |
| "grad_norm": 1.2228825438248228, |
| "kl": 0.6796875, |
| "learning_rate": 1.9975640502598243e-05, |
| "loss": 0.0315, |
| "reward": 2.96875, |
| "reward_std": 2.65271133184433, |
| "rewards/accuracy_reward_staging": 0.171875, |
| "rewards/format_reward": 0.453125, |
| "rewards/format_reward_staging": 0.796875, |
| "step": 48 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 598.65625, |
| "epoch": 24.5, |
| "grad_norm": 1.146901459334409, |
| "kl": 0.6064453125, |
| "learning_rate": 1.9969173337331283e-05, |
| "loss": 0.1387, |
| "reward": 1.859375, |
| "reward_std": 1.7595358788967133, |
| "rewards/accuracy_reward_staging": 0.0625, |
| "rewards/format_reward": 0.453125, |
| "rewards/format_reward_staging": 0.78125, |
| "step": 49 |
| }, |
| { |
| "epoch": 25.0, |
| "grad_norm": 1.375237075929609, |
| "learning_rate": 1.9961946980917457e-05, |
| "loss": 0.156, |
| "step": 50 |
| }, |
| { |
| "epoch": 25.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 575.0234375, |
| "eval_kl": 1.018798828125, |
| "eval_loss": 0.19486893713474274, |
| "eval_reward": 2.0234375, |
| "eval_reward_std": 2.9213491678237915, |
| "eval_rewards/accuracy_reward_staging": 0.1015625, |
| "eval_rewards/format_reward": 0.3671875, |
| "eval_rewards/format_reward_staging": 0.640625, |
| "eval_runtime": 59.4705, |
| "eval_samples_per_second": 0.135, |
| "eval_steps_per_second": 0.017, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 562.515625, |
| "epoch": 25.5, |
| "grad_norm": 188.4500979533979, |
| "kl": 2.618408203125, |
| "learning_rate": 1.9953961983671792e-05, |
| "loss": 0.5177, |
| "reward": 2.5, |
| "reward_std": 3.7192839682102203, |
| "rewards/accuracy_reward_staging": 0.1484375, |
| "rewards/format_reward": 0.3671875, |
| "rewards/format_reward_staging": 0.6484375, |
| "step": 51 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 571.78125, |
| "epoch": 26.0, |
| "grad_norm": 7356.139198346828, |
| "kl": 219.451171875, |
| "learning_rate": 1.9945218953682736e-05, |
| "loss": 12.1361, |
| "reward": 1.1875, |
| "reward_std": 2.103448197245598, |
| "rewards/accuracy_reward_staging": 0.046875, |
| "rewards/format_reward": 0.265625, |
| "rewards/format_reward_staging": 0.453125, |
| "step": 52 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.9375, |
| "epoch": 26.5, |
| "grad_norm": 58.63460971785782, |
| "kl": 2.384765625, |
| "learning_rate": 1.9935718556765878e-05, |
| "loss": 0.3151, |
| "reward": 0.625, |
| "reward_std": 1.280954971909523, |
| "rewards/accuracy_reward_staging": 0.015625, |
| "rewards/format_reward": 0.203125, |
| "rewards/format_reward_staging": 0.265625, |
| "step": 53 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 685.90625, |
| "epoch": 27.0, |
| "grad_norm": 6.628332912526643, |
| "kl": 1.361328125, |
| "learning_rate": 1.9925461516413224e-05, |
| "loss": 0.3519, |
| "reward": 0.515625, |
| "reward_std": 1.2405942231416702, |
| "rewards/accuracy_reward_staging": 0.015625, |
| "rewards/format_reward": 0.15625, |
| "rewards/format_reward_staging": 0.203125, |
| "step": 54 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 358.859375, |
| "epoch": 27.5, |
| "grad_norm": 3.8474726393370386, |
| "kl": 3.169921875, |
| "learning_rate": 1.9914448613738107e-05, |
| "loss": 0.339, |
| "reward": 0.78125, |
| "reward_std": 1.5188637673854828, |
| "rewards/accuracy_reward_staging": 0.03125, |
| "rewards/format_reward": 0.203125, |
| "rewards/format_reward_staging": 0.265625, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 160.28125, |
| "epoch": 28.0, |
| "grad_norm": 1711.6146146132392, |
| "kl": 29.546875, |
| "learning_rate": 1.9902680687415704e-05, |
| "loss": 2.1619, |
| "reward": 1.15625, |
| "reward_std": 2.088463395833969, |
| "rewards/accuracy_reward_staging": 0.0625, |
| "rewards/format_reward": 0.21875, |
| "rewards/format_reward_staging": 0.3125, |
| "step": 56 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 204.78125, |
| "epoch": 28.5, |
| "grad_norm": 172.76666342508884, |
| "kl": 7.39453125, |
| "learning_rate": 1.989015863361917e-05, |
| "loss": 0.6565, |
| "reward": 1.3125, |
| "reward_std": 2.61825592815876, |
| "rewards/accuracy_reward_staging": 0.078125, |
| "rewards/format_reward": 0.25, |
| "rewards/format_reward_staging": 0.28125, |
| "step": 57 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 114.875, |
| "epoch": 29.0, |
| "grad_norm": 14.841168261507285, |
| "kl": 4.638671875, |
| "learning_rate": 1.9876883405951378e-05, |
| "loss": -0.094, |
| "reward": 0.78125, |
| "reward_std": 1.340459167957306, |
| "rewards/accuracy_reward_staging": 0.03125, |
| "rewards/format_reward": 0.234375, |
| "rewards/format_reward_staging": 0.234375, |
| "step": 58 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 159.546875, |
| "epoch": 29.5, |
| "grad_norm": 38.76716232869443, |
| "kl": 3.5234375, |
| "learning_rate": 1.9862856015372315e-05, |
| "loss": 0.0135, |
| "reward": 1.125, |
| "reward_std": 2.4588640481233597, |
| "rewards/accuracy_reward_staging": 0.0625, |
| "rewards/format_reward": 0.28125, |
| "rewards/format_reward_staging": 0.21875, |
| "step": 59 |
| }, |
| { |
| "epoch": 30.0, |
| "grad_norm": 7.631987331877285, |
| "learning_rate": 1.9848077530122083e-05, |
| "loss": 0.0661, |
| "step": 60 |
| }, |
| { |
| "epoch": 30.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 180.6640625, |
| "eval_kl": 40.0986328125, |
| "eval_loss": 4.4661993980407715, |
| "eval_reward": 2.296875, |
| "eval_reward_std": 2.8048948869109154, |
| "eval_rewards/accuracy_reward_staging": 0.125, |
| "eval_rewards/format_reward": 0.5, |
| "eval_rewards/format_reward_staging": 0.546875, |
| "eval_runtime": 26.742, |
| "eval_samples_per_second": 0.299, |
| "eval_steps_per_second": 0.037, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 175.5703125, |
| "epoch": 30.5, |
| "grad_norm": 10.455818966130773, |
| "kl": 2.056640625, |
| "learning_rate": 1.983254907563955e-05, |
| "loss": 0.058, |
| "reward": 2.5390625, |
| "reward_std": 3.450487032532692, |
| "rewards/accuracy_reward_staging": 0.15625, |
| "rewards/format_reward": 0.4765625, |
| "rewards/format_reward_staging": 0.5, |
| "step": 61 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 196.15625, |
| "epoch": 31.0, |
| "grad_norm": 40355.16362771545, |
| "kl": 549.29296875, |
| "learning_rate": 1.9816271834476642e-05, |
| "loss": 46.2265, |
| "reward": 1.53125, |
| "reward_std": 1.8515103608369827, |
| "rewards/accuracy_reward_staging": 0.046875, |
| "rewards/format_reward": 0.46875, |
| "rewards/format_reward_staging": 0.59375, |
| "step": 62 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 136.71875, |
| "epoch": 31.5, |
| "grad_norm": 13611.623138416926, |
| "kl": 673.318359375, |
| "learning_rate": 1.9799247046208297e-05, |
| "loss": 49.8755, |
| "reward": 2.5, |
| "reward_std": 3.039003312587738, |
| "rewards/accuracy_reward_staging": 0.140625, |
| "rewards/format_reward": 0.515625, |
| "rewards/format_reward_staging": 0.578125, |
| "step": 63 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 177.1875, |
| "epoch": 32.0, |
| "grad_norm": 30.436611541482517, |
| "kl": 1.78515625, |
| "learning_rate": 1.9781476007338058e-05, |
| "loss": 0.2471, |
| "reward": 4.484375, |
| "reward_std": 3.2696904987096786, |
| "rewards/accuracy_reward_staging": 0.3125, |
| "rewards/format_reward": 0.703125, |
| "rewards/format_reward_staging": 0.65625, |
| "step": 64 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 146.578125, |
| "epoch": 32.5, |
| "grad_norm": 53.62802825228914, |
| "kl": 1.798828125, |
| "learning_rate": 1.9762960071199334e-05, |
| "loss": 0.1721, |
| "reward": 3.953125, |
| "reward_std": 3.039620280265808, |
| "rewards/accuracy_reward_staging": 0.25, |
| "rewards/format_reward": 0.734375, |
| "rewards/format_reward_staging": 0.71875, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 140.65625, |
| "epoch": 33.0, |
| "grad_norm": 25.346908056620485, |
| "kl": 2.13671875, |
| "learning_rate": 1.9743700647852356e-05, |
| "loss": 0.1572, |
| "reward": 3.59375, |
| "reward_std": 2.9897230714559555, |
| "rewards/accuracy_reward_staging": 0.21875, |
| "rewards/format_reward": 0.6875, |
| "rewards/format_reward_staging": 0.71875, |
| "step": 66 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 118.59375, |
| "epoch": 33.5, |
| "grad_norm": 27.454135887691468, |
| "kl": 2.25390625, |
| "learning_rate": 1.9723699203976768e-05, |
| "loss": -0.0063, |
| "reward": 1.875, |
| "reward_std": 1.3994054794311523, |
| "rewards/accuracy_reward_staging": 0.03125, |
| "rewards/format_reward": 0.765625, |
| "rewards/format_reward_staging": 0.796875, |
| "step": 67 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 94.875, |
| "epoch": 34.0, |
| "grad_norm": 2.334869737314596, |
| "kl": 1.462890625, |
| "learning_rate": 1.9702957262759964e-05, |
| "loss": 0.0725, |
| "reward": 5.890625, |
| "reward_std": 3.7823618352413177, |
| "rewards/accuracy_reward_staging": 0.40625, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 68 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 100.140625, |
| "epoch": 34.5, |
| "grad_norm": 15.721464309872673, |
| "kl": 3.056640625, |
| "learning_rate": 1.968147640378108e-05, |
| "loss": -0.0322, |
| "reward": 5.96875, |
| "reward_std": 3.234290450811386, |
| "rewards/accuracy_reward_staging": 0.4375, |
| "rewards/format_reward": 0.78125, |
| "rewards/format_reward_staging": 0.8125, |
| "step": 69 |
| }, |
| { |
| "epoch": 35.0, |
| "grad_norm": 57428.88495542333, |
| "learning_rate": 1.9659258262890683e-05, |
| "loss": 54.082, |
| "step": 70 |
| }, |
| { |
| "epoch": 35.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 96.375, |
| "eval_kl": 2.8583984375, |
| "eval_loss": 0.3439752459526062, |
| "eval_reward": 3.6953125, |
| "eval_reward_std": 2.673917531967163, |
| "eval_rewards/accuracy_reward_staging": 0.2109375, |
| "eval_rewards/format_reward": 0.7890625, |
| "eval_rewards/format_reward_staging": 0.796875, |
| "eval_runtime": 18.4421, |
| "eval_samples_per_second": 0.434, |
| "eval_steps_per_second": 0.054, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 107.3515625, |
| "epoch": 35.5, |
| "grad_norm": 6.629087724584887, |
| "kl": 291.8291015625, |
| "learning_rate": 1.963630453208623e-05, |
| "loss": 0.0924, |
| "reward": 3.15625, |
| "reward_std": 2.308130495250225, |
| "rewards/accuracy_reward_staging": 0.15625, |
| "rewards/format_reward": 0.8125, |
| "rewards/format_reward_staging": 0.78125, |
| "step": 71 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 84.40625, |
| "epoch": 36.0, |
| "grad_norm": 7.593042467965846, |
| "kl": 3.369140625, |
| "learning_rate": 1.961261695938319e-05, |
| "loss": -0.1438, |
| "reward": 3.828125, |
| "reward_std": 2.8605447858572006, |
| "rewards/accuracy_reward_staging": 0.265625, |
| "rewards/format_reward": 0.609375, |
| "rewards/format_reward_staging": 0.5625, |
| "step": 72 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 91.109375, |
| "epoch": 36.5, |
| "grad_norm": 15.85973287182394, |
| "kl": 2.373046875, |
| "learning_rate": 1.958819734868193e-05, |
| "loss": 0.1167, |
| "reward": 5.375, |
| "reward_std": 3.426166355609894, |
| "rewards/accuracy_reward_staging": 0.359375, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 73 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 82.4375, |
| "epoch": 37.0, |
| "grad_norm": 2.8730940159336003, |
| "kl": 1.78125, |
| "learning_rate": 1.9563047559630356e-05, |
| "loss": -0.0405, |
| "reward": 3.078125, |
| "reward_std": 1.6442697197198868, |
| "rewards/accuracy_reward_staging": 0.125, |
| "rewards/format_reward": 0.96875, |
| "rewards/format_reward_staging": 0.859375, |
| "step": 74 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.8125, |
| "epoch": 37.5, |
| "grad_norm": 2.084130828938957, |
| "kl": 1.671875, |
| "learning_rate": 1.953716950748227e-05, |
| "loss": 0.0182, |
| "reward": 4.046875, |
| "reward_std": 2.878495067358017, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.8125, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 153.84375, |
| "epoch": 38.0, |
| "grad_norm": 3.91268521038018, |
| "kl": 1.5625, |
| "learning_rate": 1.9510565162951538e-05, |
| "loss": 0.2027, |
| "reward": 3.46875, |
| "reward_std": 2.793519899249077, |
| "rewards/accuracy_reward_staging": 0.171875, |
| "rewards/format_reward": 0.875, |
| "rewards/format_reward_staging": 0.875, |
| "step": 76 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 131.390625, |
| "epoch": 38.5, |
| "grad_norm": 2.196193406623696, |
| "kl": 1.4453125, |
| "learning_rate": 1.9483236552061996e-05, |
| "loss": 0.1111, |
| "reward": 2.78125, |
| "reward_std": 2.516305774450302, |
| "rewards/accuracy_reward_staging": 0.109375, |
| "rewards/format_reward": 0.84375, |
| "rewards/format_reward_staging": 0.84375, |
| "step": 77 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 144.90625, |
| "epoch": 39.0, |
| "grad_norm": 1.8430147295312553, |
| "kl": 1.384765625, |
| "learning_rate": 1.945518575599317e-05, |
| "loss": 0.112, |
| "reward": 3.765625, |
| "reward_std": 2.567844718694687, |
| "rewards/accuracy_reward_staging": 0.1875, |
| "rewards/format_reward": 0.921875, |
| "rewards/format_reward_staging": 0.96875, |
| "step": 78 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.875, |
| "epoch": 39.5, |
| "grad_norm": 2.2632400272333797, |
| "kl": 1.80859375, |
| "learning_rate": 1.9426414910921785e-05, |
| "loss": 0.0539, |
| "reward": 2.671875, |
| "reward_std": 1.4072720408439636, |
| "rewards/accuracy_reward_staging": 0.078125, |
| "rewards/format_reward": 0.953125, |
| "rewards/format_reward_staging": 0.9375, |
| "step": 79 |
| }, |
| { |
| "epoch": 40.0, |
| "grad_norm": 13.05828881312249, |
| "learning_rate": 1.9396926207859085e-05, |
| "loss": 0.2618, |
| "step": 80 |
| }, |
| { |
| "epoch": 40.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 107.0625, |
| "eval_kl": 1.9775390625, |
| "eval_loss": 0.1502879410982132, |
| "eval_reward": 3.3671875, |
| "eval_reward_std": 2.961985230445862, |
| "eval_rewards/accuracy_reward_staging": 0.1875, |
| "eval_rewards/format_reward": 0.734375, |
| "eval_rewards/format_reward_staging": 0.7578125, |
| "eval_runtime": 17.3809, |
| "eval_samples_per_second": 0.46, |
| "eval_steps_per_second": 0.058, |
| "step": 80 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 142.5, |
| "epoch": 40.5, |
| "grad_norm": 3.1754965235193864, |
| "kl": 1.7373046875, |
| "learning_rate": 1.9366721892483976e-05, |
| "loss": 0.1444, |
| "reward": 3.125, |
| "reward_std": 2.8966881707310677, |
| "rewards/accuracy_reward_staging": 0.171875, |
| "rewards/format_reward": 0.6875, |
| "rewards/format_reward_staging": 0.71875, |
| "step": 81 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 86.5625, |
| "epoch": 41.0, |
| "grad_norm": 3.7587088555535186, |
| "kl": 1.984375, |
| "learning_rate": 1.9335804264972018e-05, |
| "loss": 0.2337, |
| "reward": 3.6875, |
| "reward_std": 3.980203613638878, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.625, |
| "rewards/format_reward_staging": 0.71875, |
| "step": 82 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 135.828125, |
| "epoch": 41.5, |
| "grad_norm": 3.975006138810092, |
| "kl": 2.05859375, |
| "learning_rate": 1.9304175679820247e-05, |
| "loss": 0.2953, |
| "reward": 3.09375, |
| "reward_std": 3.4910158962011337, |
| "rewards/accuracy_reward_staging": 0.1875, |
| "rewards/format_reward": 0.5625, |
| "rewards/format_reward_staging": 0.65625, |
| "step": 83 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 82.1875, |
| "epoch": 42.0, |
| "grad_norm": 18.245540108535405, |
| "kl": 1.978515625, |
| "learning_rate": 1.9271838545667876e-05, |
| "loss": 0.1649, |
| "reward": 4.015625, |
| "reward_std": 2.856592908501625, |
| "rewards/accuracy_reward_staging": 0.21875, |
| "rewards/format_reward": 0.921875, |
| "rewards/format_reward_staging": 0.90625, |
| "step": 84 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 124.890625, |
| "epoch": 42.5, |
| "grad_norm": 24.523972833745187, |
| "kl": 8.154296875, |
| "learning_rate": 1.9238795325112867e-05, |
| "loss": 0.2386, |
| "reward": 3.65625, |
| "reward_std": 3.7115366458892822, |
| "rewards/accuracy_reward_staging": 0.21875, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.75, |
| "step": 85 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 96.125, |
| "epoch": 43.0, |
| "grad_norm": 6.021647719248748, |
| "kl": 1.91796875, |
| "learning_rate": 1.9205048534524405e-05, |
| "loss": 0.0759, |
| "reward": 2.75, |
| "reward_std": 1.662882000207901, |
| "rewards/accuracy_reward_staging": 0.125, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.78125, |
| "step": 86 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 135.796875, |
| "epoch": 43.5, |
| "grad_norm": 11.816137935249499, |
| "kl": 4.1640625, |
| "learning_rate": 1.917060074385124e-05, |
| "loss": 0.223, |
| "reward": 1.375, |
| "reward_std": 1.6268048882484436, |
| "rewards/accuracy_reward_staging": 0.03125, |
| "rewards/format_reward": 0.484375, |
| "rewards/format_reward_staging": 0.578125, |
| "step": 87 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 68.59375, |
| "epoch": 44.0, |
| "grad_norm": 14.46698321940086, |
| "kl": 1.978515625, |
| "learning_rate": 1.913545457642601e-05, |
| "loss": 0.0088, |
| "reward": 5.953125, |
| "reward_std": 2.1696823835372925, |
| "rewards/accuracy_reward_staging": 0.40625, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.953125, |
| "step": 88 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 68.234375, |
| "epoch": 44.5, |
| "grad_norm": 63.367821121151934, |
| "kl": 57.087890625, |
| "learning_rate": 1.9099612708765432e-05, |
| "loss": 0.3075, |
| "reward": 4.0625, |
| "reward_std": 0.23853857815265656, |
| "rewards/accuracy_reward_staging": 0.25, |
| "rewards/format_reward": 0.78125, |
| "rewards/format_reward_staging": 0.78125, |
| "step": 89 |
| }, |
| { |
| "epoch": 45.0, |
| "grad_norm": 11.140992168821672, |
| "learning_rate": 1.9063077870366504e-05, |
| "loss": 0.0056, |
| "step": 90 |
| }, |
| { |
| "epoch": 45.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 53.8046875, |
| "eval_kl": 3.3603515625, |
| "eval_loss": -0.141469344496727, |
| "eval_reward": 3.75, |
| "eval_reward_std": 0.6776039004325867, |
| "eval_rewards/accuracy_reward_staging": 0.2265625, |
| "eval_rewards/format_reward": 0.7265625, |
| "eval_rewards/format_reward_staging": 0.7578125, |
| "eval_runtime": 8.6597, |
| "eval_samples_per_second": 0.924, |
| "eval_steps_per_second": 0.115, |
| "step": 90 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 59.875, |
| "epoch": 45.5, |
| "grad_norm": 17.729188427124154, |
| "kl": 3.9931640625, |
| "learning_rate": 1.902585284349861e-05, |
| "loss": -0.0965, |
| "reward": 3.8828125, |
| "reward_std": 0.6357803642749786, |
| "rewards/accuracy_reward_staging": 0.2421875, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.7421875, |
| "step": 91 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 51.75, |
| "epoch": 46.0, |
| "grad_norm": 6.567024017689014, |
| "kl": 2.939453125, |
| "learning_rate": 1.8987940462991673e-05, |
| "loss": 0.0987, |
| "reward": 3.796875, |
| "reward_std": 0.7728912532329559, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.75, |
| "rewards/format_reward_staging": 0.703125, |
| "step": 92 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 34.703125, |
| "epoch": 46.5, |
| "grad_norm": 2.7843419900288473, |
| "kl": 4.166015625, |
| "learning_rate": 1.894934361602025e-05, |
| "loss": 0.1104, |
| "reward": 3.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward_staging": 0.25, |
| "rewards/format_reward": 0.5, |
| "rewards/format_reward_staging": 0.5, |
| "step": 93 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 63.5, |
| "epoch": 47.0, |
| "grad_norm": 2.842446909471968, |
| "kl": 1.50390625, |
| "learning_rate": 1.891006524188368e-05, |
| "loss": 0.0614, |
| "reward": 4.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward_staging": 0.25, |
| "rewards/format_reward": 1.0, |
| "rewards/format_reward_staging": 1.0, |
| "step": 94 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 48.609375, |
| "epoch": 47.5, |
| "grad_norm": 16.935662051415687, |
| "kl": 3.41796875, |
| "learning_rate": 1.887010833178222e-05, |
| "loss": 0.0746, |
| "reward": 3.96875, |
| "reward_std": 0.08539125323295593, |
| "rewards/accuracy_reward_staging": 0.25, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.75, |
| "step": 95 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 48.40625, |
| "epoch": 48.0, |
| "grad_norm": 65.56995634930182, |
| "kl": 8.68359375, |
| "learning_rate": 1.8829475928589272e-05, |
| "loss": 0.2729, |
| "reward": 2.96875, |
| "reward_std": 1.9943940043449402, |
| "rewards/accuracy_reward_staging": 0.15625, |
| "rewards/format_reward": 0.6875, |
| "rewards/format_reward_staging": 0.71875, |
| "step": 96 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 53.921875, |
| "epoch": 48.5, |
| "grad_norm": 7.023283731196324, |
| "kl": 3.861328125, |
| "learning_rate": 1.8788171126619653e-05, |
| "loss": 0.0774, |
| "reward": 1.375, |
| "reward_std": 0.26598526537418365, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.6875, |
| "rewards/format_reward_staging": 0.6875, |
| "step": 97 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 53.40625, |
| "epoch": 49.0, |
| "grad_norm": 5.600387532938351, |
| "kl": 4.03515625, |
| "learning_rate": 1.874619707139396e-05, |
| "loss": 0.1163, |
| "reward": 1.84375, |
| "reward_std": 1.9289895445108414, |
| "rewards/accuracy_reward_staging": 0.0625, |
| "rewards/format_reward": 0.609375, |
| "rewards/format_reward_staging": 0.609375, |
| "step": 98 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 51.5625, |
| "epoch": 49.5, |
| "grad_norm": 546.222996523794, |
| "kl": 10.48046875, |
| "learning_rate": 1.8703556959398998e-05, |
| "loss": 0.2164, |
| "reward": 1.96875, |
| "reward_std": 1.679331585764885, |
| "rewards/accuracy_reward_staging": 0.078125, |
| "rewards/format_reward": 0.59375, |
| "rewards/format_reward_staging": 0.59375, |
| "step": 99 |
| }, |
| { |
| "epoch": 50.0, |
| "grad_norm": 6.92675967993841, |
| "learning_rate": 1.866025403784439e-05, |
| "loss": 0.0302, |
| "step": 100 |
| }, |
| { |
| "epoch": 50.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 53.1953125, |
| "eval_kl": 3.412109375, |
| "eval_loss": -0.0034832179080694914, |
| "eval_reward": 1.84375, |
| "eval_reward_std": 1.4304483458399773, |
| "eval_rewards/accuracy_reward_staging": 0.0546875, |
| "eval_rewards/format_reward": 0.6484375, |
| "eval_rewards/format_reward_staging": 0.6484375, |
| "eval_runtime": 8.882, |
| "eval_samples_per_second": 0.901, |
| "eval_steps_per_second": 0.113, |
| "step": 100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 58.1484375, |
| "epoch": 50.5, |
| "grad_norm": 42.08311553908255, |
| "kl": 4.25390625, |
| "learning_rate": 1.861629160441526e-05, |
| "loss": 0.1744, |
| "reward": 1.890625, |
| "reward_std": 1.2476187869906425, |
| "rewards/accuracy_reward_staging": 0.0546875, |
| "rewards/format_reward": 0.671875, |
| "rewards/format_reward_staging": 0.671875, |
| "step": 101 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 57.875, |
| "epoch": 51.0, |
| "grad_norm": 14.483914466544725, |
| "kl": 3.54296875, |
| "learning_rate": 1.8571673007021124e-05, |
| "loss": 0.0479, |
| "reward": 2.515625, |
| "reward_std": 1.502477765083313, |
| "rewards/accuracy_reward_staging": 0.109375, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.703125, |
| "step": 102 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 77.078125, |
| "epoch": 51.5, |
| "grad_norm": 3.9386280468332933, |
| "kl": 1.173828125, |
| "learning_rate": 1.8526401643540924e-05, |
| "loss": 0.0927, |
| "reward": 3.34375, |
| "reward_std": 1.4447221755981445, |
| "rewards/accuracy_reward_staging": 0.140625, |
| "rewards/format_reward": 0.953125, |
| "rewards/format_reward_staging": 0.984375, |
| "step": 103 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 42.65625, |
| "epoch": 52.0, |
| "grad_norm": 5.22687443422562, |
| "kl": 3.640625, |
| "learning_rate": 1.848048096156426e-05, |
| "loss": 0.0985, |
| "reward": 3.1875, |
| "reward_std": 0.8539125919342041, |
| "rewards/accuracy_reward_staging": 0.21875, |
| "rewards/format_reward": 0.5, |
| "rewards/format_reward_staging": 0.5, |
| "step": 104 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 53.59375, |
| "epoch": 52.5, |
| "grad_norm": 4.578066958828574, |
| "kl": 2.548828125, |
| "learning_rate": 1.843391445812886e-05, |
| "loss": 0.0016, |
| "reward": 3.46875, |
| "reward_std": 2.002212718129158, |
| "rewards/accuracy_reward_staging": 0.21875, |
| "rewards/format_reward": 0.625, |
| "rewards/format_reward_staging": 0.65625, |
| "step": 105 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 57.5625, |
| "epoch": 53.0, |
| "grad_norm": 24.480999374163673, |
| "kl": 2.4599609375, |
| "learning_rate": 1.8386705679454243e-05, |
| "loss": -0.3046, |
| "reward": 3.96875, |
| "reward_std": 0.25, |
| "rewards/accuracy_reward_staging": 0.25, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.75, |
| "step": 106 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 39.734375, |
| "epoch": 53.5, |
| "grad_norm": 9.034066251198404, |
| "kl": 3.109375, |
| "learning_rate": 1.8338858220671683e-05, |
| "loss": -0.1409, |
| "reward": 0.9375, |
| "reward_std": 0.29578250646591187, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.46875, |
| "rewards/format_reward_staging": 0.46875, |
| "step": 107 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 69.5, |
| "epoch": 54.0, |
| "grad_norm": 2.4189570966395286, |
| "kl": 1.1376953125, |
| "learning_rate": 1.8290375725550417e-05, |
| "loss": -0.05, |
| "reward": 6.734375, |
| "reward_std": 2.646019369363785, |
| "rewards/accuracy_reward_staging": 0.484375, |
| "rewards/format_reward": 0.921875, |
| "rewards/format_reward_staging": 0.96875, |
| "step": 108 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 56.0, |
| "epoch": 54.5, |
| "grad_norm": 5.916855107370096, |
| "kl": 2.0869140625, |
| "learning_rate": 1.8241261886220155e-05, |
| "loss": 0.0611, |
| "reward": 4.578125, |
| "reward_std": 1.265925258398056, |
| "rewards/accuracy_reward_staging": 0.3125, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.734375, |
| "step": 109 |
| }, |
| { |
| "epoch": 55.0, |
| "grad_norm": 10.466101735080972, |
| "learning_rate": 1.819152044288992e-05, |
| "loss": 0.0845, |
| "step": 110 |
| }, |
| { |
| "epoch": 55.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 54.3359375, |
| "eval_kl": 1.70068359375, |
| "eval_loss": -0.013815220445394516, |
| "eval_reward": 3.875, |
| "eval_reward_std": 1.7073951363563538, |
| "eval_rewards/accuracy_reward_staging": 0.25, |
| "eval_rewards/format_reward": 0.6875, |
| "eval_rewards/format_reward_staging": 0.6875, |
| "eval_runtime": 8.9181, |
| "eval_samples_per_second": 0.897, |
| "eval_steps_per_second": 0.112, |
| "step": 110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 62.453125, |
| "epoch": 55.5, |
| "grad_norm": 5.151983159259695, |
| "kl": 2.58154296875, |
| "learning_rate": 1.8141155183563195e-05, |
| "loss": -0.0532, |
| "reward": 3.125, |
| "reward_std": 1.16278538107872, |
| "rewards/accuracy_reward_staging": 0.1484375, |
| "rewards/format_reward": 0.8203125, |
| "rewards/format_reward_staging": 0.8203125, |
| "step": 111 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 40.09375, |
| "epoch": 56.0, |
| "grad_norm": 3.725576654068748, |
| "kl": 2.65234375, |
| "learning_rate": 1.8090169943749477e-05, |
| "loss": 0.0112, |
| "reward": 5.234375, |
| "reward_std": 1.9529178738594055, |
| "rewards/accuracy_reward_staging": 0.4375, |
| "rewards/format_reward": 0.40625, |
| "rewards/format_reward_staging": 0.453125, |
| "step": 112 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 68.09375, |
| "epoch": 56.5, |
| "grad_norm": 3.421963670561032, |
| "kl": 1.21875, |
| "learning_rate": 1.8038568606172172e-05, |
| "loss": -0.0952, |
| "reward": 4.09375, |
| "reward_std": 1.8953916430473328, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.828125, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 113 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 38.9375, |
| "epoch": 57.0, |
| "grad_norm": 19.54047147726119, |
| "kl": 3.12890625, |
| "learning_rate": 1.798635510047293e-05, |
| "loss": 0.0866, |
| "reward": 3.828125, |
| "reward_std": 1.6875420212745667, |
| "rewards/accuracy_reward_staging": 0.296875, |
| "rewards/format_reward": 0.359375, |
| "rewards/format_reward_staging": 0.5, |
| "step": 114 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 69.234375, |
| "epoch": 57.5, |
| "grad_norm": 11.266561852713883, |
| "kl": 2.2763671875, |
| "learning_rate": 1.7933533402912354e-05, |
| "loss": 0.0311, |
| "reward": 4.9375, |
| "reward_std": 1.8707758784294128, |
| "rewards/accuracy_reward_staging": 0.3125, |
| "rewards/format_reward": 0.84375, |
| "rewards/format_reward_staging": 0.96875, |
| "step": 115 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 36.625, |
| "epoch": 58.0, |
| "grad_norm": 2.4520418407831195, |
| "kl": 3.1875, |
| "learning_rate": 1.788010753606722e-05, |
| "loss": 0.0328, |
| "reward": 3.09375, |
| "reward_std": 1.0208056271076202, |
| "rewards/accuracy_reward_staging": 0.21875, |
| "rewards/format_reward": 0.421875, |
| "rewards/format_reward_staging": 0.484375, |
| "step": 116 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 65.109375, |
| "epoch": 58.5, |
| "grad_norm": 3.085523482612536, |
| "kl": 1.400390625, |
| "learning_rate": 1.782608156852414e-05, |
| "loss": -0.1165, |
| "reward": 3.90625, |
| "reward_std": 2.0603334307670593, |
| "rewards/accuracy_reward_staging": 0.21875, |
| "rewards/format_reward": 0.796875, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 117 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 29.5625, |
| "epoch": 59.0, |
| "grad_norm": 10.594589302032892, |
| "kl": 2.95703125, |
| "learning_rate": 1.777145961456971e-05, |
| "loss": 0.0282, |
| "reward": 3.1875, |
| "reward_std": 1.6452402472496033, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.375, |
| "rewards/format_reward_staging": 0.46875, |
| "step": 118 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 37.84375, |
| "epoch": 59.5, |
| "grad_norm": 4.104628944341923, |
| "kl": 2.490234375, |
| "learning_rate": 1.7716245833877202e-05, |
| "loss": 0.0598, |
| "reward": 1.109375, |
| "reward_std": 0.6517204642295837, |
| "rewards/accuracy_reward_staging": 0.015625, |
| "rewards/format_reward": 0.46875, |
| "rewards/format_reward_staging": 0.484375, |
| "step": 119 |
| }, |
| { |
| "epoch": 60.0, |
| "grad_norm": 2.7051939689212157, |
| "learning_rate": 1.766044443118978e-05, |
| "loss": -0.0924, |
| "step": 120 |
| }, |
| { |
| "epoch": 60.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 49.0859375, |
| "eval_kl": 2.880859375, |
| "eval_loss": -0.00882991123944521, |
| "eval_reward": 4.03125, |
| "eval_reward_std": 1.9904271215200424, |
| "eval_rewards/accuracy_reward_staging": 0.265625, |
| "eval_rewards/format_reward": 0.6640625, |
| "eval_rewards/format_reward_staging": 0.7109375, |
| "eval_runtime": 7.683, |
| "eval_samples_per_second": 1.041, |
| "eval_steps_per_second": 0.13, |
| "step": 120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 54.3359375, |
| "epoch": 60.5, |
| "grad_norm": 8.217105476776826, |
| "kl": 1.54443359375, |
| "learning_rate": 1.7604059656000313e-05, |
| "loss": 0.0161, |
| "reward": 5.5, |
| "reward_std": 1.8617027252912521, |
| "rewards/accuracy_reward_staging": 0.390625, |
| "rewards/format_reward": 0.7421875, |
| "rewards/format_reward_staging": 0.8515625, |
| "step": 121 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 48.21875, |
| "epoch": 61.0, |
| "grad_norm": 8.76714381411277, |
| "kl": 2.35546875, |
| "learning_rate": 1.7547095802227723e-05, |
| "loss": 0.0302, |
| "reward": 2.8125, |
| "reward_std": 1.4936581254005432, |
| "rewards/accuracy_reward_staging": 0.140625, |
| "rewards/format_reward": 0.703125, |
| "rewards/format_reward_staging": 0.703125, |
| "step": 122 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 35.328125, |
| "epoch": 61.5, |
| "grad_norm": 28.267166519601638, |
| "kl": 6.83203125, |
| "learning_rate": 1.7489557207890025e-05, |
| "loss": 0.0964, |
| "reward": 3.3125, |
| "reward_std": 0.75, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.484375, |
| "rewards/format_reward_staging": 0.484375, |
| "step": 123 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 60.15625, |
| "epoch": 62.0, |
| "grad_norm": 6.154728910682446, |
| "kl": 2.5, |
| "learning_rate": 1.7431448254773943e-05, |
| "loss": -0.0487, |
| "reward": 3.421875, |
| "reward_std": 2.588469222187996, |
| "rewards/accuracy_reward_staging": 0.15625, |
| "rewards/format_reward": 0.921875, |
| "rewards/format_reward_staging": 0.9375, |
| "step": 124 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 50.28125, |
| "epoch": 62.5, |
| "grad_norm": 6.026022088929383, |
| "kl": 2.130859375, |
| "learning_rate": 1.737277336810124e-05, |
| "loss": 0.0602, |
| "reward": 2.40625, |
| "reward_std": 2.038558602333069, |
| "rewards/accuracy_reward_staging": 0.09375, |
| "rewards/format_reward": 0.734375, |
| "rewards/format_reward_staging": 0.734375, |
| "step": 125 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 50.28125, |
| "epoch": 63.0, |
| "grad_norm": 1.6439981557613697, |
| "kl": 1.34765625, |
| "learning_rate": 1.7313537016191706e-05, |
| "loss": 0.0648, |
| "reward": 3.0625, |
| "reward_std": 1.25, |
| "rewards/accuracy_reward_staging": 0.15625, |
| "rewards/format_reward": 0.75, |
| "rewards/format_reward_staging": 0.75, |
| "step": 126 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 63.234375, |
| "epoch": 63.5, |
| "grad_norm": 3.1039400606839846, |
| "kl": 1.615234375, |
| "learning_rate": 1.7253743710122877e-05, |
| "loss": -0.0558, |
| "reward": 4.203125, |
| "reward_std": 2.4739063382148743, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 127 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 38.40625, |
| "epoch": 64.0, |
| "grad_norm": 3.9232383422378714, |
| "kl": 2.087890625, |
| "learning_rate": 1.7193398003386514e-05, |
| "loss": 0.0734, |
| "reward": 1.140625, |
| "reward_std": 0.6875, |
| "rewards/accuracy_reward_staging": 0.015625, |
| "rewards/format_reward": 0.484375, |
| "rewards/format_reward_staging": 0.5, |
| "step": 128 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 72.671875, |
| "epoch": 64.5, |
| "grad_norm": 3.018724764911343, |
| "kl": 1.263671875, |
| "learning_rate": 1.713250449154182e-05, |
| "loss": 0.0133, |
| "reward": 4.75, |
| "reward_std": 2.8038886189460754, |
| "rewards/accuracy_reward_staging": 0.28125, |
| "rewards/format_reward": 0.953125, |
| "rewards/format_reward_staging": 0.984375, |
| "step": 129 |
| }, |
| { |
| "epoch": 65.0, |
| "grad_norm": 3.0081574444150787, |
| "learning_rate": 1.7071067811865477e-05, |
| "loss": 0.0217, |
| "step": 130 |
| }, |
| { |
| "epoch": 65.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 71.6171875, |
| "eval_kl": 1.7802734375, |
| "eval_loss": 0.047457288950681686, |
| "eval_reward": 3.8203125, |
| "eval_reward_std": 1.6125783324241638, |
| "eval_rewards/accuracy_reward_staging": 0.234375, |
| "eval_rewards/format_reward": 0.7265625, |
| "eval_rewards/format_reward_staging": 0.75, |
| "eval_runtime": 9.8391, |
| "eval_samples_per_second": 0.813, |
| "eval_steps_per_second": 0.102, |
| "step": 130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 44.1796875, |
| "epoch": 65.5, |
| "grad_norm": 87.36549568538945, |
| "kl": 5.3486328125, |
| "learning_rate": 1.700909264299851e-05, |
| "loss": 0.2066, |
| "reward": 2.609375, |
| "reward_std": 1.7113949656486511, |
| "rewards/accuracy_reward_staging": 0.1640625, |
| "rewards/format_reward": 0.4765625, |
| "rewards/format_reward_staging": 0.4921875, |
| "step": 131 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 100.0, |
| "epoch": 66.0, |
| "grad_norm": 8.157221542562647, |
| "kl": 1.572265625, |
| "learning_rate": 1.6946583704589973e-05, |
| "loss": -0.028, |
| "reward": 4.15625, |
| "reward_std": 0.9789125919342041, |
| "rewards/accuracy_reward_staging": 0.21875, |
| "rewards/format_reward": 0.96875, |
| "rewards/format_reward_staging": 1.0, |
| "step": 132 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 57.234375, |
| "epoch": 66.5, |
| "grad_norm": 11.318664763937058, |
| "kl": 1.568359375, |
| "learning_rate": 1.688354575693754e-05, |
| "loss": 0.0296, |
| "reward": 3.65625, |
| "reward_std": 2.0427924394607544, |
| "rewards/accuracy_reward_staging": 0.21875, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.75, |
| "step": 133 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 48.5, |
| "epoch": 67.0, |
| "grad_norm": 4.231087003668829, |
| "kl": 1.572265625, |
| "learning_rate": 1.6819983600624986e-05, |
| "loss": -0.0374, |
| "reward": 6.015625, |
| "reward_std": 1.1542446613311768, |
| "rewards/accuracy_reward_staging": 0.46875, |
| "rewards/format_reward": 0.640625, |
| "rewards/format_reward_staging": 0.6875, |
| "step": 134 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 49.75, |
| "epoch": 67.5, |
| "grad_norm": 5.475126751687303, |
| "kl": 1.1953125, |
| "learning_rate": 1.6755902076156606e-05, |
| "loss": -0.0876, |
| "reward": 3.875, |
| "reward_std": 0.32214587926864624, |
| "rewards/accuracy_reward_staging": 0.25, |
| "rewards/format_reward": 0.640625, |
| "rewards/format_reward_staging": 0.734375, |
| "step": 135 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 49.03125, |
| "epoch": 68.0, |
| "grad_norm": 3.1878976206293186, |
| "kl": 1.29296875, |
| "learning_rate": 1.6691306063588583e-05, |
| "loss": 0.039, |
| "reward": 4.390625, |
| "reward_std": 2.472952723503113, |
| "rewards/accuracy_reward_staging": 0.296875, |
| "rewards/format_reward": 0.6875, |
| "rewards/format_reward_staging": 0.734375, |
| "step": 136 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 43.78125, |
| "epoch": 68.5, |
| "grad_norm": 5.889272400451515, |
| "kl": 2.4609375, |
| "learning_rate": 1.6626200482157378e-05, |
| "loss": -0.077, |
| "reward": 3.515625, |
| "reward_std": 1.154121845960617, |
| "rewards/accuracy_reward_staging": 0.21875, |
| "rewards/format_reward": 0.609375, |
| "rewards/format_reward_staging": 0.71875, |
| "step": 137 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 45.75, |
| "epoch": 69.0, |
| "grad_norm": 3.7471899234962898, |
| "kl": 1.05859375, |
| "learning_rate": 1.6560590289905074e-05, |
| "loss": 0.0093, |
| "reward": 4.890625, |
| "reward_std": 2.029541850090027, |
| "rewards/accuracy_reward_staging": 0.34375, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.734375, |
| "step": 138 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 29.78125, |
| "epoch": 69.5, |
| "grad_norm": 4.907092274694152, |
| "kl": 1.103515625, |
| "learning_rate": 1.6494480483301836e-05, |
| "loss": 0.024, |
| "reward": 4.6875, |
| "reward_std": 2.245893716812134, |
| "rewards/accuracy_reward_staging": 0.375, |
| "rewards/format_reward": 0.453125, |
| "rewards/format_reward_staging": 0.484375, |
| "step": 139 |
| }, |
| { |
| "epoch": 70.0, |
| "grad_norm": 6.180078269058641, |
| "learning_rate": 1.6427876096865394e-05, |
| "loss": -0.0735, |
| "step": 140 |
| }, |
| { |
| "epoch": 70.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 45.1171875, |
| "eval_kl": 1.2315673828125, |
| "eval_loss": -0.0003372877836227417, |
| "eval_reward": 4.1484375, |
| "eval_reward_std": 1.8942626863718033, |
| "eval_rewards/accuracy_reward_staging": 0.2734375, |
| "eval_rewards/format_reward": 0.7109375, |
| "eval_rewards/format_reward_staging": 0.703125, |
| "eval_runtime": 7.3533, |
| "eval_samples_per_second": 1.088, |
| "eval_steps_per_second": 0.136, |
| "step": 140 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 57.609375, |
| "epoch": 70.5, |
| "grad_norm": 3.7369372987659357, |
| "kl": 1.7197265625, |
| "learning_rate": 1.636078220277764e-05, |
| "loss": -0.0225, |
| "reward": 4.7265625, |
| "reward_std": 2.5463077425956726, |
| "rewards/accuracy_reward_staging": 0.2890625, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.9453125, |
| "step": 141 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 41.0, |
| "epoch": 71.0, |
| "grad_norm": 7.580877395639736, |
| "kl": 2.69921875, |
| "learning_rate": 1.6293203910498375e-05, |
| "loss": 0.0622, |
| "reward": 2.546875, |
| "reward_std": 1.2390142679214478, |
| "rewards/accuracy_reward_staging": 0.15625, |
| "rewards/format_reward": 0.484375, |
| "rewards/format_reward_staging": 0.5, |
| "step": 142 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 46.046875, |
| "epoch": 71.5, |
| "grad_norm": 6.018587414833117, |
| "kl": 1.54296875, |
| "learning_rate": 1.6225146366376198e-05, |
| "loss": -0.0145, |
| "reward": 7.125, |
| "reward_std": 3.5123836994171143, |
| "rewards/accuracy_reward_staging": 0.578125, |
| "rewards/format_reward": 0.625, |
| "rewards/format_reward_staging": 0.71875, |
| "step": 143 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 44.09375, |
| "epoch": 72.0, |
| "grad_norm": 12.708930539177073, |
| "kl": 2.927734375, |
| "learning_rate": 1.6156614753256583e-05, |
| "loss": 0.0341, |
| "reward": 1.40625, |
| "reward_std": 1.3823386430740356, |
| "rewards/accuracy_reward_staging": 0.046875, |
| "rewards/format_reward": 0.296875, |
| "rewards/format_reward_staging": 0.640625, |
| "step": 144 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 41.6875, |
| "epoch": 72.5, |
| "grad_norm": 26.335774506376318, |
| "kl": 2.484375, |
| "learning_rate": 1.608761429008721e-05, |
| "loss": 0.0802, |
| "reward": 4.0, |
| "reward_std": 2.5394824892282486, |
| "rewards/accuracy_reward_staging": 0.296875, |
| "rewards/format_reward": 0.34375, |
| "rewards/format_reward_staging": 0.6875, |
| "step": 145 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 43.0, |
| "epoch": 73.0, |
| "grad_norm": 10.027812774226838, |
| "kl": 2.0234375, |
| "learning_rate": 1.6018150231520486e-05, |
| "loss": -0.0637, |
| "reward": 5.078125, |
| "reward_std": 2.6153001338243484, |
| "rewards/accuracy_reward_staging": 0.390625, |
| "rewards/format_reward": 0.515625, |
| "rewards/format_reward_staging": 0.65625, |
| "step": 146 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 60.21875, |
| "epoch": 73.5, |
| "grad_norm": 7.224717472496968, |
| "kl": 2.08203125, |
| "learning_rate": 1.5948227867513416e-05, |
| "loss": -0.0085, |
| "reward": 6.375, |
| "reward_std": 1.8922232389450073, |
| "rewards/accuracy_reward_staging": 0.453125, |
| "rewards/format_reward": 0.875, |
| "rewards/format_reward_staging": 0.96875, |
| "step": 147 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 28.8125, |
| "epoch": 74.0, |
| "grad_norm": 3.90403458561103, |
| "kl": 1.033203125, |
| "learning_rate": 1.5877852522924733e-05, |
| "loss": -0.0176, |
| "reward": 4.671875, |
| "reward_std": 2.339739680290222, |
| "rewards/accuracy_reward_staging": 0.375, |
| "rewards/format_reward": 0.46875, |
| "rewards/format_reward_staging": 0.453125, |
| "step": 148 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 56.609375, |
| "epoch": 74.5, |
| "grad_norm": 5.8341448086549965, |
| "kl": 1.45703125, |
| "learning_rate": 1.5807029557109398e-05, |
| "loss": -0.0544, |
| "reward": 5.734375, |
| "reward_std": 2.3053803741931915, |
| "rewards/accuracy_reward_staging": 0.390625, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 149 |
| }, |
| { |
| "epoch": 75.0, |
| "grad_norm": 4.711908468937371, |
| "learning_rate": 1.573576436351046e-05, |
| "loss": 0.0152, |
| "step": 150 |
| }, |
| { |
| "epoch": 75.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 45.28125, |
| "eval_kl": 1.6767578125, |
| "eval_loss": -0.009632267989218235, |
| "eval_reward": 3.78125, |
| "eval_reward_std": 0.855195626616478, |
| "eval_rewards/accuracy_reward_staging": 0.234375, |
| "eval_rewards/format_reward": 0.7109375, |
| "eval_rewards/format_reward_staging": 0.7265625, |
| "eval_runtime": 7.9037, |
| "eval_samples_per_second": 1.012, |
| "eval_steps_per_second": 0.127, |
| "step": 150 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 38.3984375, |
| "epoch": 75.5, |
| "grad_norm": 29.133371779710487, |
| "kl": 1.69482421875, |
| "learning_rate": 1.566406236924833e-05, |
| "loss": 0.038, |
| "reward": 3.4609375, |
| "reward_std": 0.8732095211744308, |
| "rewards/accuracy_reward_staging": 0.2265625, |
| "rewards/format_reward": 0.5859375, |
| "rewards/format_reward_staging": 0.609375, |
| "step": 151 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 45.03125, |
| "epoch": 76.0, |
| "grad_norm": 71.16435014998271, |
| "kl": 1.560546875, |
| "learning_rate": 1.5591929034707468e-05, |
| "loss": 0.0866, |
| "reward": 1.4375, |
| "reward_std": 0.14433756470680237, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.734375, |
| "rewards/format_reward_staging": 0.703125, |
| "step": 152 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 44.6875, |
| "epoch": 76.5, |
| "grad_norm": 1.0512585294331593, |
| "kl": 0.98028564453125, |
| "learning_rate": 1.5519369853120584e-05, |
| "loss": 0.0315, |
| "reward": 3.984375, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward_staging": 0.25, |
| "rewards/format_reward": 0.734375, |
| "rewards/format_reward_staging": 0.75, |
| "step": 153 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 43.375, |
| "epoch": 77.0, |
| "grad_norm": 4.22052916794067, |
| "kl": 1.216796875, |
| "learning_rate": 1.5446390350150272e-05, |
| "loss": -0.0304, |
| "reward": 3.125, |
| "reward_std": 1.421726554632187, |
| "rewards/accuracy_reward_staging": 0.171875, |
| "rewards/format_reward": 0.671875, |
| "rewards/format_reward_staging": 0.734375, |
| "step": 154 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 43.0625, |
| "epoch": 77.5, |
| "grad_norm": 3.803510821246787, |
| "kl": 1.2509765625, |
| "learning_rate": 1.5372996083468242e-05, |
| "loss": 0.0057, |
| "reward": 3.75, |
| "reward_std": 0.7565859854221344, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.671875, |
| "rewards/format_reward_staging": 0.734375, |
| "step": 155 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 43.6875, |
| "epoch": 78.0, |
| "grad_norm": 1.6103062564934338, |
| "kl": 1.2275390625, |
| "learning_rate": 1.529919264233205e-05, |
| "loss": 0.0116, |
| "reward": 3.484375, |
| "reward_std": 1.0873424708843231, |
| "rewards/accuracy_reward_staging": 0.203125, |
| "rewards/format_reward": 0.703125, |
| "rewards/format_reward_staging": 0.75, |
| "step": 156 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 46.46875, |
| "epoch": 78.5, |
| "grad_norm": 3.73947644081386, |
| "kl": 1.92578125, |
| "learning_rate": 1.5224985647159489e-05, |
| "loss": -0.0074, |
| "reward": 1.421875, |
| "reward_std": 0.17430339753627777, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.671875, |
| "rewards/format_reward_staging": 0.75, |
| "step": 157 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 48.8125, |
| "epoch": 79.0, |
| "grad_norm": 168.19806472829526, |
| "kl": 4.16015625, |
| "learning_rate": 1.5150380749100545e-05, |
| "loss": 0.185, |
| "reward": 6.15625, |
| "reward_std": 1.3114574551582336, |
| "rewards/accuracy_reward_staging": 0.46875, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.75, |
| "step": 158 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 74.703125, |
| "epoch": 79.5, |
| "grad_norm": 8.124079093394728, |
| "kl": 3.5859375, |
| "learning_rate": 1.5075383629607043e-05, |
| "loss": 0.0754, |
| "reward": 4.140625, |
| "reward_std": 0.8125, |
| "rewards/accuracy_reward_staging": 0.265625, |
| "rewards/format_reward": 0.734375, |
| "rewards/format_reward_staging": 0.75, |
| "step": 159 |
| }, |
| { |
| "epoch": 80.0, |
| "grad_norm": 24.584018478630828, |
| "learning_rate": 1.5000000000000002e-05, |
| "loss": 0.0492, |
| "step": 160 |
| }, |
| { |
| "epoch": 80.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 48.65625, |
| "eval_kl": 2.4384765625, |
| "eval_loss": 0.05181257054209709, |
| "eval_reward": 3.734375, |
| "eval_reward_std": 0.824847549200058, |
| "eval_rewards/accuracy_reward_staging": 0.2265625, |
| "eval_rewards/format_reward": 0.734375, |
| "eval_rewards/format_reward_staging": 0.734375, |
| "eval_runtime": 7.9932, |
| "eval_samples_per_second": 1.001, |
| "eval_steps_per_second": 0.125, |
| "step": 160 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 76.4921875, |
| "epoch": 80.5, |
| "grad_norm": 6.67934298891478, |
| "kl": 4.8759765625, |
| "learning_rate": 1.4924235601034673e-05, |
| "loss": -0.0417, |
| "reward": 2.34375, |
| "reward_std": 0.5171605423092842, |
| "rewards/accuracy_reward_staging": 0.1171875, |
| "rewards/format_reward": 0.6015625, |
| "rewards/format_reward_staging": 0.5703125, |
| "step": 161 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 54.84375, |
| "epoch": 81.0, |
| "grad_norm": 3.819874734506586, |
| "kl": 1.767578125, |
| "learning_rate": 1.4848096202463373e-05, |
| "loss": -0.0168, |
| "reward": 6.125, |
| "reward_std": 1.4207825064659119, |
| "rewards/accuracy_reward_staging": 0.421875, |
| "rewards/format_reward": 0.953125, |
| "rewards/format_reward_staging": 0.953125, |
| "step": 162 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 98.859375, |
| "epoch": 81.5, |
| "grad_norm": 3.0908583823850546, |
| "kl": 1.6953125, |
| "learning_rate": 1.4771587602596085e-05, |
| "loss": 0.0503, |
| "reward": 3.515625, |
| "reward_std": 1.2676234245300293, |
| "rewards/accuracy_reward_staging": 0.203125, |
| "rewards/format_reward": 0.75, |
| "rewards/format_reward_staging": 0.734375, |
| "step": 163 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 84.71875, |
| "epoch": 82.0, |
| "grad_norm": 13.65817282343041, |
| "kl": 3.357421875, |
| "learning_rate": 1.469471562785891e-05, |
| "loss": 0.0879, |
| "reward": 3.984375, |
| "reward_std": 1.3739574551582336, |
| "rewards/accuracy_reward_staging": 0.25, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.765625, |
| "step": 164 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 56.1875, |
| "epoch": 82.5, |
| "grad_norm": 9.321438696837953, |
| "kl": 2.984375, |
| "learning_rate": 1.4617486132350343e-05, |
| "loss": 0.0382, |
| "reward": 6.53125, |
| "reward_std": 1.5894616693258286, |
| "rewards/accuracy_reward_staging": 0.46875, |
| "rewards/format_reward": 0.921875, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 165 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 117.40625, |
| "epoch": 83.0, |
| "grad_norm": 5.820169059348112, |
| "kl": 2.486328125, |
| "learning_rate": 1.4539904997395468e-05, |
| "loss": 0.046, |
| "reward": 0.984375, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.5, |
| "rewards/format_reward_staging": 0.484375, |
| "step": 166 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 56.96875, |
| "epoch": 83.5, |
| "grad_norm": 3.3619746674675457, |
| "kl": 1.662109375, |
| "learning_rate": 1.4461978131098089e-05, |
| "loss": 0.0254, |
| "reward": 3.953125, |
| "reward_std": 0.1875, |
| "rewards/accuracy_reward_staging": 0.25, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.734375, |
| "step": 167 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 69.4375, |
| "epoch": 84.0, |
| "grad_norm": 4.984323181253807, |
| "kl": 2.224609375, |
| "learning_rate": 1.4383711467890776e-05, |
| "loss": -0.0085, |
| "reward": 3.5, |
| "reward_std": 1.2682048827409744, |
| "rewards/accuracy_reward_staging": 0.21875, |
| "rewards/format_reward": 0.640625, |
| "rewards/format_reward_staging": 0.671875, |
| "step": 168 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 53.859375, |
| "epoch": 84.5, |
| "grad_norm": 10.010983408872603, |
| "kl": 2.3125, |
| "learning_rate": 1.4305110968082953e-05, |
| "loss": -0.028, |
| "reward": 3.75, |
| "reward_std": 2.024695038795471, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.703125, |
| "rewards/format_reward_staging": 0.703125, |
| "step": 169 |
| }, |
| { |
| "epoch": 85.0, |
| "grad_norm": 4.824303447060251, |
| "learning_rate": 1.4226182617406996e-05, |
| "loss": 0.0597, |
| "step": 170 |
| }, |
| { |
| "epoch": 85.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 70.4140625, |
| "eval_kl": 2.732421875, |
| "eval_loss": 0.05451072007417679, |
| "eval_reward": 3.640625, |
| "eval_reward_std": 0.930374264717102, |
| "eval_rewards/accuracy_reward_staging": 0.21875, |
| "eval_rewards/format_reward": 0.7265625, |
| "eval_rewards/format_reward_staging": 0.7265625, |
| "eval_runtime": 13.695, |
| "eval_samples_per_second": 0.584, |
| "eval_steps_per_second": 0.073, |
| "step": 170 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 89.2734375, |
| "epoch": 85.5, |
| "grad_norm": 19.993154281622356, |
| "kl": 2.4169921875, |
| "learning_rate": 1.4146932426562391e-05, |
| "loss": 0.0789, |
| "reward": 3.4296875, |
| "reward_std": 0.9237357676029205, |
| "rewards/accuracy_reward_staging": 0.2265625, |
| "rewards/format_reward": 0.5625, |
| "rewards/format_reward_staging": 0.6015625, |
| "step": 171 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 58.90625, |
| "epoch": 86.0, |
| "grad_norm": 5.701163289666223, |
| "kl": 1.955078125, |
| "learning_rate": 1.4067366430758004e-05, |
| "loss": 0.0564, |
| "reward": 4.4375, |
| "reward_std": 0.21039125323295593, |
| "rewards/accuracy_reward_staging": 0.25, |
| "rewards/format_reward": 0.953125, |
| "rewards/format_reward_staging": 0.984375, |
| "step": 172 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 104.9375, |
| "epoch": 86.5, |
| "grad_norm": 8.75300625649785, |
| "kl": 1.7421875, |
| "learning_rate": 1.3987490689252463e-05, |
| "loss": 0.0637, |
| "reward": 1.4375, |
| "reward_std": 0.21039125323295593, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.703125, |
| "rewards/format_reward_staging": 0.734375, |
| "step": 173 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 62.90625, |
| "epoch": 87.0, |
| "grad_norm": 62.159358500619824, |
| "kl": 23.37890625, |
| "learning_rate": 1.3907311284892737e-05, |
| "loss": 0.1646, |
| "reward": 5.515625, |
| "reward_std": 2.2276171147823334, |
| "rewards/accuracy_reward_staging": 0.421875, |
| "rewards/format_reward": 0.640625, |
| "rewards/format_reward_staging": 0.65625, |
| "step": 174 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 102.90625, |
| "epoch": 87.5, |
| "grad_norm": 12.112575261726283, |
| "kl": 3.974609375, |
| "learning_rate": 1.3826834323650899e-05, |
| "loss": 0.0835, |
| "reward": 3.9375, |
| "reward_std": 1.6707825064659119, |
| "rewards/accuracy_reward_staging": 0.25, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.71875, |
| "step": 175 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 139.6875, |
| "epoch": 88.0, |
| "grad_norm": 35.23828311935734, |
| "kl": 7.0, |
| "learning_rate": 1.3746065934159123e-05, |
| "loss": 0.2503, |
| "reward": 3.703125, |
| "reward_std": 1.1275950223207474, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.6875, |
| "rewards/format_reward_staging": 0.671875, |
| "step": 176 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 73.734375, |
| "epoch": 88.5, |
| "grad_norm": 34.69838372391708, |
| "kl": 6.234375, |
| "learning_rate": 1.3665012267242974e-05, |
| "loss": 0.3016, |
| "reward": 3.75, |
| "reward_std": 1.6523646861314774, |
| "rewards/accuracy_reward_staging": 0.203125, |
| "rewards/format_reward": 0.875, |
| "rewards/format_reward_staging": 0.84375, |
| "step": 177 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 247.34375, |
| "epoch": 89.0, |
| "grad_norm": 8.743276590821418, |
| "kl": 1.91796875, |
| "learning_rate": 1.3583679495453e-05, |
| "loss": 0.1189, |
| "reward": 2.765625, |
| "reward_std": 1.4041407108306885, |
| "rewards/accuracy_reward_staging": 0.1875, |
| "rewards/format_reward": 0.453125, |
| "rewards/format_reward_staging": 0.4375, |
| "step": 178 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 185.65625, |
| "epoch": 89.5, |
| "grad_norm": 64.71970490888229, |
| "kl": 10.216796875, |
| "learning_rate": 1.3502073812594677e-05, |
| "loss": 0.2277, |
| "reward": 1.28125, |
| "reward_std": 0.48439764976501465, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.671875, |
| "rewards/format_reward_staging": 0.609375, |
| "step": 179 |
| }, |
| { |
| "epoch": 90.0, |
| "grad_norm": 641.7646960533058, |
| "learning_rate": 1.342020143325669e-05, |
| "loss": 1.3646, |
| "step": 180 |
| }, |
| { |
| "epoch": 90.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 180.421875, |
| "eval_kl": 4.01953125, |
| "eval_loss": 0.2227374017238617, |
| "eval_reward": 2.875, |
| "eval_reward_std": 2.4358191564679146, |
| "eval_rewards/accuracy_reward_staging": 0.171875, |
| "eval_rewards/format_reward": 0.5859375, |
| "eval_rewards/format_reward_staging": 0.5703125, |
| "eval_runtime": 21.5649, |
| "eval_samples_per_second": 0.371, |
| "eval_steps_per_second": 0.046, |
| "step": 180 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 163.03125, |
| "epoch": 90.5, |
| "grad_norm": 28.433179154739406, |
| "kl": 26.9296875, |
| "learning_rate": 1.333806859233771e-05, |
| "loss": 0.2116, |
| "reward": 3.984375, |
| "reward_std": 2.283327080309391, |
| "rewards/accuracy_reward_staging": 0.2734375, |
| "rewards/format_reward": 0.6484375, |
| "rewards/format_reward_staging": 0.6015625, |
| "step": 181 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 197.5625, |
| "epoch": 91.0, |
| "grad_norm": 35.69313057863262, |
| "kl": 9.8515625, |
| "learning_rate": 1.3255681544571568e-05, |
| "loss": 0.5447, |
| "reward": 3.21875, |
| "reward_std": 2.838429868221283, |
| "rewards/accuracy_reward_staging": 0.21875, |
| "rewards/format_reward": 0.53125, |
| "rewards/format_reward_staging": 0.5, |
| "step": 182 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 121.59375, |
| "epoch": 91.5, |
| "grad_norm": 22.53366341132682, |
| "kl": 4.31640625, |
| "learning_rate": 1.3173046564050923e-05, |
| "loss": 0.2777, |
| "reward": 2.71875, |
| "reward_std": 2.1865703761577606, |
| "rewards/accuracy_reward_staging": 0.15625, |
| "rewards/format_reward": 0.578125, |
| "rewards/format_reward_staging": 0.578125, |
| "step": 183 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 70.78125, |
| "epoch": 92.0, |
| "grad_norm": 23.102470088426298, |
| "kl": 3.43359375, |
| "learning_rate": 1.3090169943749475e-05, |
| "loss": 0.2916, |
| "reward": 3.484375, |
| "reward_std": 3.165164679288864, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.59375, |
| "rewards/format_reward_staging": 0.546875, |
| "step": 184 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 85.625, |
| "epoch": 92.5, |
| "grad_norm": 17.631303386576295, |
| "kl": 2.8828125, |
| "learning_rate": 1.300705799504273e-05, |
| "loss": 0.0945, |
| "reward": 3.1875, |
| "reward_std": 1.7888548523187637, |
| "rewards/accuracy_reward_staging": 0.1875, |
| "rewards/format_reward": 0.6875, |
| "rewards/format_reward_staging": 0.625, |
| "step": 185 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 60.96875, |
| "epoch": 93.0, |
| "grad_norm": 17.17673811558118, |
| "kl": 3.25390625, |
| "learning_rate": 1.2923717047227368e-05, |
| "loss": 0.0913, |
| "reward": 3.890625, |
| "reward_std": 3.1874619126319885, |
| "rewards/accuracy_reward_staging": 0.265625, |
| "rewards/format_reward": 0.640625, |
| "rewards/format_reward_staging": 0.59375, |
| "step": 186 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 62.6875, |
| "epoch": 93.5, |
| "grad_norm": 21.78344941308155, |
| "kl": 3.767578125, |
| "learning_rate": 1.284015344703923e-05, |
| "loss": 0.1005, |
| "reward": 3.875, |
| "reward_std": 2.966622516512871, |
| "rewards/accuracy_reward_staging": 0.25, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.65625, |
| "step": 187 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 74.4375, |
| "epoch": 94.0, |
| "grad_norm": 123.72852412016174, |
| "kl": 13.330078125, |
| "learning_rate": 1.2756373558169992e-05, |
| "loss": 0.3738, |
| "reward": 3.796875, |
| "reward_std": 1.500662550330162, |
| "rewards/accuracy_reward_staging": 0.21875, |
| "rewards/format_reward": 0.8125, |
| "rewards/format_reward_staging": 0.796875, |
| "step": 188 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 61.109375, |
| "epoch": 94.5, |
| "grad_norm": 15.826582743437257, |
| "kl": 2.990234375, |
| "learning_rate": 1.267238376078257e-05, |
| "loss": 0.0504, |
| "reward": 6.734375, |
| "reward_std": 4.132839202880859, |
| "rewards/accuracy_reward_staging": 0.515625, |
| "rewards/format_reward": 0.796875, |
| "rewards/format_reward_staging": 0.78125, |
| "step": 189 |
| }, |
| { |
| "epoch": 95.0, |
| "grad_norm": 12.174094790036403, |
| "learning_rate": 1.2588190451025209e-05, |
| "loss": 0.1562, |
| "step": 190 |
| }, |
| { |
| "epoch": 95.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 70.90625, |
| "eval_kl": 5.5693359375, |
| "eval_loss": 0.1942664086818695, |
| "eval_reward": 4.5625, |
| "eval_reward_std": 2.227724313735962, |
| "eval_rewards/accuracy_reward_staging": 0.296875, |
| "eval_rewards/format_reward": 0.796875, |
| "eval_rewards/format_reward_staging": 0.796875, |
| "eval_runtime": 11.384, |
| "eval_samples_per_second": 0.703, |
| "eval_steps_per_second": 0.088, |
| "step": 190 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 67.8046875, |
| "epoch": 95.5, |
| "grad_norm": 29.4793161720303, |
| "kl": 7.30078125, |
| "learning_rate": 1.2503800040544417e-05, |
| "loss": 0.1191, |
| "reward": 2.21875, |
| "reward_std": 1.4497334137558937, |
| "rewards/accuracy_reward_staging": 0.078125, |
| "rewards/format_reward": 0.734375, |
| "rewards/format_reward_staging": 0.703125, |
| "step": 191 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 60.0, |
| "epoch": 96.0, |
| "grad_norm": 18.528850308456384, |
| "kl": 5.0078125, |
| "learning_rate": 1.2419218955996677e-05, |
| "loss": -0.0102, |
| "reward": 6.4375, |
| "reward_std": 2.324888586997986, |
| "rewards/accuracy_reward_staging": 0.46875, |
| "rewards/format_reward": 0.859375, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 192 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 65.65625, |
| "epoch": 96.5, |
| "grad_norm": 321.1577068941354, |
| "kl": 10.72265625, |
| "learning_rate": 1.2334453638559057e-05, |
| "loss": 0.3994, |
| "reward": 3.5625, |
| "reward_std": 1.81937974691391, |
| "rewards/accuracy_reward_staging": 0.1875, |
| "rewards/format_reward": 0.84375, |
| "rewards/format_reward_staging": 0.84375, |
| "step": 193 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 124.09375, |
| "epoch": 97.0, |
| "grad_norm": 298.30380544944734, |
| "kl": 16.484375, |
| "learning_rate": 1.2249510543438652e-05, |
| "loss": 1.1348, |
| "reward": 6.0, |
| "reward_std": 3.5904677510261536, |
| "rewards/accuracy_reward_staging": 0.453125, |
| "rewards/format_reward": 0.734375, |
| "rewards/format_reward_staging": 0.734375, |
| "step": 194 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 61.265625, |
| "epoch": 97.5, |
| "grad_norm": 342.2488163586622, |
| "kl": 39.9609375, |
| "learning_rate": 1.2164396139381029e-05, |
| "loss": 0.6107, |
| "reward": 4.109375, |
| "reward_std": 1.6107770651578903, |
| "rewards/accuracy_reward_staging": 0.25, |
| "rewards/format_reward": 0.796875, |
| "rewards/format_reward_staging": 0.8125, |
| "step": 195 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 71.5, |
| "epoch": 98.0, |
| "grad_norm": 332.30146930549876, |
| "kl": 18.890625, |
| "learning_rate": 1.2079116908177592e-05, |
| "loss": 0.7398, |
| "reward": 4.28125, |
| "reward_std": 3.740285500884056, |
| "rewards/accuracy_reward_staging": 0.3125, |
| "rewards/format_reward": 0.5625, |
| "rewards/format_reward_staging": 0.59375, |
| "step": 196 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 57.046875, |
| "epoch": 98.5, |
| "grad_norm": 40.05423534554934, |
| "kl": 9.9453125, |
| "learning_rate": 1.1993679344171973e-05, |
| "loss": -0.0582, |
| "reward": 5.203125, |
| "reward_std": 3.098964586853981, |
| "rewards/accuracy_reward_staging": 0.375, |
| "rewards/format_reward": 0.734375, |
| "rewards/format_reward_staging": 0.71875, |
| "step": 197 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 72.75, |
| "epoch": 99.0, |
| "grad_norm": 190.25413324992218, |
| "kl": 18.7421875, |
| "learning_rate": 1.190808995376545e-05, |
| "loss": 0.4647, |
| "reward": 2.796875, |
| "reward_std": 2.5183838307857513, |
| "rewards/accuracy_reward_staging": 0.15625, |
| "rewards/format_reward": 0.625, |
| "rewards/format_reward_staging": 0.609375, |
| "step": 198 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 75.15625, |
| "epoch": 99.5, |
| "grad_norm": 77.45958102748877, |
| "kl": 8.5078125, |
| "learning_rate": 1.1822355254921478e-05, |
| "loss": 0.2798, |
| "reward": 5.265625, |
| "reward_std": 4.132492363452911, |
| "rewards/accuracy_reward_staging": 0.40625, |
| "rewards/format_reward": 0.59375, |
| "rewards/format_reward_staging": 0.609375, |
| "step": 199 |
| }, |
| { |
| "epoch": 100.0, |
| "grad_norm": 41.16387064311233, |
| "learning_rate": 1.1736481776669307e-05, |
| "loss": 0.1693, |
| "step": 200 |
| }, |
| { |
| "epoch": 100.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 74.859375, |
| "eval_kl": 11.505859375, |
| "eval_loss": 0.30862951278686523, |
| "eval_reward": 3.7578125, |
| "eval_reward_std": 3.54784195125103, |
| "eval_rewards/accuracy_reward_staging": 0.2578125, |
| "eval_rewards/format_reward": 0.59375, |
| "eval_rewards/format_reward_staging": 0.5859375, |
| "eval_runtime": 20.7253, |
| "eval_samples_per_second": 0.386, |
| "eval_steps_per_second": 0.048, |
| "step": 200 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 62.8984375, |
| "epoch": 100.5, |
| "grad_norm": 46.99447269815204, |
| "kl": 7.478515625, |
| "learning_rate": 1.1650476058606776e-05, |
| "loss": 0.0531, |
| "reward": 3.7890625, |
| "reward_std": 2.675357274711132, |
| "rewards/accuracy_reward_staging": 0.2421875, |
| "rewards/format_reward": 0.6875, |
| "rewards/format_reward_staging": 0.6796875, |
| "step": 201 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 112.5, |
| "epoch": 101.0, |
| "grad_norm": 87.46696486815911, |
| "kl": 10.09375, |
| "learning_rate": 1.156434465040231e-05, |
| "loss": 0.3972, |
| "reward": 3.765625, |
| "reward_std": 3.323235496878624, |
| "rewards/accuracy_reward_staging": 0.265625, |
| "rewards/format_reward": 0.5625, |
| "rewards/format_reward_staging": 0.546875, |
| "step": 202 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 62.71875, |
| "epoch": 101.5, |
| "grad_norm": 18.374497201547754, |
| "kl": 3.76953125, |
| "learning_rate": 1.1478094111296109e-05, |
| "loss": -0.0432, |
| "reward": 3.28125, |
| "reward_std": 3.127034693956375, |
| "rewards/accuracy_reward_staging": 0.1875, |
| "rewards/format_reward": 0.703125, |
| "rewards/format_reward_staging": 0.703125, |
| "step": 203 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 71.78125, |
| "epoch": 102.0, |
| "grad_norm": 29.592536211203182, |
| "kl": 2.9140625, |
| "learning_rate": 1.1391731009600655e-05, |
| "loss": 0.1431, |
| "reward": 5.21875, |
| "reward_std": 4.481558993458748, |
| "rewards/accuracy_reward_staging": 0.390625, |
| "rewards/format_reward": 0.65625, |
| "rewards/format_reward_staging": 0.65625, |
| "step": 204 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 123.96875, |
| "epoch": 102.5, |
| "grad_norm": 33.395337096968554, |
| "kl": 2.564453125, |
| "learning_rate": 1.130526192220052e-05, |
| "loss": 0.2351, |
| "reward": 5.3125, |
| "reward_std": 3.812277674674988, |
| "rewards/accuracy_reward_staging": 0.390625, |
| "rewards/format_reward": 0.6875, |
| "rewards/format_reward_staging": 0.71875, |
| "step": 205 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 70.5625, |
| "epoch": 103.0, |
| "grad_norm": 19.970853229745217, |
| "kl": 2.921875, |
| "learning_rate": 1.1218693434051475e-05, |
| "loss": 0.0446, |
| "reward": 4.78125, |
| "reward_std": 2.263821601867676, |
| "rewards/accuracy_reward_staging": 0.328125, |
| "rewards/format_reward": 0.765625, |
| "rewards/format_reward_staging": 0.734375, |
| "step": 206 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 68.78125, |
| "epoch": 103.5, |
| "grad_norm": 14.17204450158029, |
| "kl": 2.71875, |
| "learning_rate": 1.113203213767907e-05, |
| "loss": 0.0621, |
| "reward": 3.28125, |
| "reward_std": 2.0387277007102966, |
| "rewards/accuracy_reward_staging": 0.1875, |
| "rewards/format_reward": 0.703125, |
| "rewards/format_reward_staging": 0.703125, |
| "step": 207 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 98.84375, |
| "epoch": 104.0, |
| "grad_norm": 45.999753153217384, |
| "kl": 2.953125, |
| "learning_rate": 1.1045284632676535e-05, |
| "loss": 0.3512, |
| "reward": 7.703125, |
| "reward_std": 4.490310102701187, |
| "rewards/accuracy_reward_staging": 0.609375, |
| "rewards/format_reward": 0.8125, |
| "rewards/format_reward_staging": 0.796875, |
| "step": 208 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 73.859375, |
| "epoch": 104.5, |
| "grad_norm": 16.39069761676693, |
| "kl": 2.822265625, |
| "learning_rate": 1.0958457525202241e-05, |
| "loss": 0.1927, |
| "reward": 7.15625, |
| "reward_std": 2.440823122859001, |
| "rewards/accuracy_reward_staging": 0.546875, |
| "rewards/format_reward": 0.84375, |
| "rewards/format_reward_staging": 0.84375, |
| "step": 209 |
| }, |
| { |
| "epoch": 105.0, |
| "grad_norm": 32.991938805145374, |
| "learning_rate": 1.0871557427476585e-05, |
| "loss": 0.2606, |
| "step": 210 |
| }, |
| { |
| "epoch": 105.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 76.171875, |
| "eval_kl": 2.79296875, |
| "eval_loss": 0.3493640124797821, |
| "eval_reward": 6.234375, |
| "eval_reward_std": 2.950258269906044, |
| "eval_rewards/accuracy_reward_staging": 0.453125, |
| "eval_rewards/format_reward": 0.8515625, |
| "eval_rewards/format_reward_staging": 0.8515625, |
| "eval_runtime": 17.079, |
| "eval_samples_per_second": 0.468, |
| "eval_steps_per_second": 0.059, |
| "step": 210 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 80.3046875, |
| "epoch": 105.5, |
| "grad_norm": 14.591066359234059, |
| "kl": 3.3798828125, |
| "learning_rate": 1.0784590957278452e-05, |
| "loss": 0.2564, |
| "reward": 3.3984375, |
| "reward_std": 2.145370692014694, |
| "rewards/accuracy_reward_staging": 0.171875, |
| "rewards/format_reward": 0.8359375, |
| "rewards/format_reward_staging": 0.84375, |
| "step": 211 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 76.625, |
| "epoch": 106.0, |
| "grad_norm": 213.24507222643214, |
| "kl": 15.8828125, |
| "learning_rate": 1.0697564737441254e-05, |
| "loss": 0.8925, |
| "reward": 9.28125, |
| "reward_std": 5.5177276730537415, |
| "rewards/accuracy_reward_staging": 0.765625, |
| "rewards/format_reward": 0.8125, |
| "rewards/format_reward_staging": 0.8125, |
| "step": 212 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 110.328125, |
| "epoch": 106.5, |
| "grad_norm": 374.4514896454632, |
| "kl": 7.72265625, |
| "learning_rate": 1.0610485395348571e-05, |
| "loss": 1.0949, |
| "reward": 5.296875, |
| "reward_std": 2.8596606701612473, |
| "rewards/accuracy_reward_staging": 0.375, |
| "rewards/format_reward": 0.765625, |
| "rewards/format_reward_staging": 0.78125, |
| "step": 213 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 69.875, |
| "epoch": 107.0, |
| "grad_norm": 1105.673343880607, |
| "kl": 33.830078125, |
| "learning_rate": 1.0523359562429441e-05, |
| "loss": 1.6287, |
| "reward": 7.4375, |
| "reward_std": 2.3971213698387146, |
| "rewards/accuracy_reward_staging": 0.5625, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.90625, |
| "step": 214 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 73.890625, |
| "epoch": 107.5, |
| "grad_norm": 123.4698676668202, |
| "kl": 5.3046875, |
| "learning_rate": 1.0436193873653362e-05, |
| "loss": 0.3884, |
| "reward": 5.53125, |
| "reward_std": 2.4318894147872925, |
| "rewards/accuracy_reward_staging": 0.375, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 215 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 102.03125, |
| "epoch": 108.0, |
| "grad_norm": 160.67968983733866, |
| "kl": 7.8359375, |
| "learning_rate": 1.0348994967025012e-05, |
| "loss": 0.9398, |
| "reward": 6.359375, |
| "reward_std": 4.46427845954895, |
| "rewards/accuracy_reward_staging": 0.484375, |
| "rewards/format_reward": 0.8125, |
| "rewards/format_reward_staging": 0.703125, |
| "step": 216 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 75.0625, |
| "epoch": 108.5, |
| "grad_norm": 25.51936131101817, |
| "kl": 2.392578125, |
| "learning_rate": 1.0261769483078734e-05, |
| "loss": 0.2649, |
| "reward": 1.75, |
| "reward_std": 0.47541579604148865, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.921875, |
| "rewards/format_reward_staging": 0.828125, |
| "step": 217 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 137.28125, |
| "epoch": 109.0, |
| "grad_norm": 148.7024918074023, |
| "kl": 5.13671875, |
| "learning_rate": 1.0174524064372837e-05, |
| "loss": 1.1347, |
| "reward": 11.765625, |
| "reward_std": 4.963782727718353, |
| "rewards/accuracy_reward_staging": 1.0, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.875, |
| "step": 218 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 94.1875, |
| "epoch": 109.5, |
| "grad_norm": 42.61961427865945, |
| "kl": 5.0546875, |
| "learning_rate": 1.008726535498374e-05, |
| "loss": 0.8258, |
| "reward": 4.84375, |
| "reward_std": 2.711217939853668, |
| "rewards/accuracy_reward_staging": 0.328125, |
| "rewards/format_reward": 0.765625, |
| "rewards/format_reward_staging": 0.796875, |
| "step": 219 |
| }, |
| { |
| "epoch": 110.0, |
| "grad_norm": 134.36362522399034, |
| "learning_rate": 1e-05, |
| "loss": 1.1661, |
| "step": 220 |
| }, |
| { |
| "epoch": 110.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 117.09375, |
| "eval_kl": 4.9931640625, |
| "eval_loss": 0.5141848921775818, |
| "eval_reward": 6.1640625, |
| "eval_reward_std": 3.0512350350618362, |
| "eval_rewards/accuracy_reward_staging": 0.4453125, |
| "eval_rewards/format_reward": 0.84375, |
| "eval_rewards/format_reward_staging": 0.8671875, |
| "eval_runtime": 19.2232, |
| "eval_samples_per_second": 0.416, |
| "eval_steps_per_second": 0.052, |
| "step": 220 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 133.65625, |
| "epoch": 110.5, |
| "grad_norm": 105.48263389855896, |
| "kl": 6.9462890625, |
| "learning_rate": 9.912734645016262e-06, |
| "loss": 0.8125, |
| "reward": 5.2578125, |
| "reward_std": 3.310664713382721, |
| "rewards/accuracy_reward_staging": 0.3671875, |
| "rewards/format_reward": 0.8125, |
| "rewards/format_reward_staging": 0.7734375, |
| "step": 221 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 103.78125, |
| "epoch": 111.0, |
| "grad_norm": 28.059069132963547, |
| "kl": 6.390625, |
| "learning_rate": 9.825475935627165e-06, |
| "loss": 0.5255, |
| "reward": 8.9375, |
| "reward_std": 2.479752615094185, |
| "rewards/accuracy_reward_staging": 0.71875, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.859375, |
| "step": 222 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 83.953125, |
| "epoch": 111.5, |
| "grad_norm": 71.56716252183017, |
| "kl": 2.30078125, |
| "learning_rate": 9.738230516921272e-06, |
| "loss": 0.4025, |
| "reward": 1.796875, |
| "reward_std": 0.54747274518013, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.90625, |
| "step": 223 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 100.0, |
| "epoch": 112.0, |
| "grad_norm": 27.218944194745887, |
| "kl": 3.3046875, |
| "learning_rate": 9.651005032974994e-06, |
| "loss": 0.2448, |
| "reward": 10.125, |
| "reward_std": 5.460300043225288, |
| "rewards/accuracy_reward_staging": 0.84375, |
| "rewards/format_reward": 0.875, |
| "rewards/format_reward_staging": 0.8125, |
| "step": 224 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 94.84375, |
| "epoch": 112.5, |
| "grad_norm": 68.22276023866495, |
| "kl": 2.3671875, |
| "learning_rate": 9.563806126346643e-06, |
| "loss": 0.5597, |
| "reward": 7.28125, |
| "reward_std": 2.8067111521959305, |
| "rewards/accuracy_reward_staging": 0.546875, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.90625, |
| "step": 225 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 74.75, |
| "epoch": 113.0, |
| "grad_norm": 49.515502954596094, |
| "kl": 1.8046875, |
| "learning_rate": 9.476640437570562e-06, |
| "loss": 0.4145, |
| "reward": 5.734375, |
| "reward_std": 2.974400073289871, |
| "rewards/accuracy_reward_staging": 0.390625, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 226 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 84.515625, |
| "epoch": 113.5, |
| "grad_norm": 80.52287464279028, |
| "kl": 1.96484375, |
| "learning_rate": 9.38951460465143e-06, |
| "loss": 0.5935, |
| "reward": 4.671875, |
| "reward_std": 2.0293429493904114, |
| "rewards/accuracy_reward_staging": 0.28125, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 227 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 78.3125, |
| "epoch": 114.0, |
| "grad_norm": 79.3984300703632, |
| "kl": 9.23046875, |
| "learning_rate": 9.302435262558748e-06, |
| "loss": 0.5448, |
| "reward": 8.84375, |
| "reward_std": 3.9457506239414215, |
| "rewards/accuracy_reward_staging": 0.703125, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.90625, |
| "step": 228 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 82.3125, |
| "epoch": 114.5, |
| "grad_norm": 144.0165272069023, |
| "kl": 7.28125, |
| "learning_rate": 9.215409042721553e-06, |
| "loss": 0.6451, |
| "reward": 5.078125, |
| "reward_std": 2.225419983267784, |
| "rewards/accuracy_reward_staging": 0.328125, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 229 |
| }, |
| { |
| "epoch": 115.0, |
| "grad_norm": 321.9641788098979, |
| "learning_rate": 9.128442572523418e-06, |
| "loss": 1.5742, |
| "step": 230 |
| }, |
| { |
| "epoch": 115.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 96.328125, |
| "eval_kl": 5.0361328125, |
| "eval_loss": 0.9659979939460754, |
| "eval_reward": 6.09375, |
| "eval_reward_std": 2.717326804995537, |
| "eval_rewards/accuracy_reward_staging": 0.4375, |
| "eval_rewards/format_reward": 0.8828125, |
| "eval_rewards/format_reward_staging": 0.8359375, |
| "eval_runtime": 21.7514, |
| "eval_samples_per_second": 0.368, |
| "eval_steps_per_second": 0.046, |
| "step": 230 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 133.8828125, |
| "epoch": 115.5, |
| "grad_norm": 330.9234870737541, |
| "kl": 7.890625, |
| "learning_rate": 9.04154247479776e-06, |
| "loss": 1.4764, |
| "reward": 5.0703125, |
| "reward_std": 3.3657846450805664, |
| "rewards/accuracy_reward_staging": 0.34375, |
| "rewards/format_reward": 0.828125, |
| "rewards/format_reward_staging": 0.8046875, |
| "step": 231 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 76.3125, |
| "epoch": 116.0, |
| "grad_norm": 110.48910505492681, |
| "kl": 4.37890625, |
| "learning_rate": 8.954715367323468e-06, |
| "loss": 0.5441, |
| "reward": 7.890625, |
| "reward_std": 3.448995918035507, |
| "rewards/accuracy_reward_staging": 0.609375, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 232 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 90.125, |
| "epoch": 116.5, |
| "grad_norm": 24.431265643699565, |
| "kl": 2.525390625, |
| "learning_rate": 8.867967862320935e-06, |
| "loss": 0.498, |
| "reward": 5.21875, |
| "reward_std": 2.0470831990242004, |
| "rewards/accuracy_reward_staging": 0.34375, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.875, |
| "step": 233 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 69.15625, |
| "epoch": 117.0, |
| "grad_norm": 22.167233415343524, |
| "kl": 3.0390625, |
| "learning_rate": 8.781306565948528e-06, |
| "loss": 0.2231, |
| "reward": 7.9375, |
| "reward_std": 2.1919010430574417, |
| "rewards/accuracy_reward_staging": 0.609375, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.90625, |
| "step": 234 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 85.6875, |
| "epoch": 117.5, |
| "grad_norm": 15.990875997337938, |
| "kl": 2.197265625, |
| "learning_rate": 8.694738077799487e-06, |
| "loss": 0.3279, |
| "reward": 5.0, |
| "reward_std": 2.6655498147010803, |
| "rewards/accuracy_reward_staging": 0.328125, |
| "rewards/format_reward": 0.859375, |
| "rewards/format_reward_staging": 0.859375, |
| "step": 235 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 69.34375, |
| "epoch": 118.0, |
| "grad_norm": 17.73495165238854, |
| "kl": 2.447265625, |
| "learning_rate": 8.60826899039935e-06, |
| "loss": -0.0273, |
| "reward": 7.859375, |
| "reward_std": 3.111342281103134, |
| "rewards/accuracy_reward_staging": 0.609375, |
| "rewards/format_reward": 0.875, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 236 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 102.9375, |
| "epoch": 118.5, |
| "grad_norm": 62.41785090382259, |
| "kl": 3.140625, |
| "learning_rate": 8.521905888703894e-06, |
| "loss": 0.4091, |
| "reward": 5.640625, |
| "reward_std": 2.2090050280094147, |
| "rewards/accuracy_reward_staging": 0.390625, |
| "rewards/format_reward": 0.859375, |
| "rewards/format_reward_staging": 0.875, |
| "step": 237 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 99.9375, |
| "epoch": 119.0, |
| "grad_norm": 19.556702226372, |
| "kl": 1.966796875, |
| "learning_rate": 8.43565534959769e-06, |
| "loss": 0.3726, |
| "reward": 7.4375, |
| "reward_std": 3.3722455203533173, |
| "rewards/accuracy_reward_staging": 0.5625, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.875, |
| "step": 238 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 97.28125, |
| "epoch": 119.5, |
| "grad_norm": 46.29528683228946, |
| "kl": 9.9296875, |
| "learning_rate": 8.349523941393224e-06, |
| "loss": 0.8543, |
| "reward": 6.15625, |
| "reward_std": 2.8786120861768723, |
| "rewards/accuracy_reward_staging": 0.4375, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 239 |
| }, |
| { |
| "epoch": 120.0, |
| "grad_norm": 74.29601903587687, |
| "learning_rate": 8.263518223330698e-06, |
| "loss": 0.6792, |
| "step": 240 |
| }, |
| { |
| "epoch": 120.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 85.34375, |
| "eval_kl": 3.4296875, |
| "eval_loss": 0.3411344885826111, |
| "eval_reward": 6.59375, |
| "eval_reward_std": 2.5715944170951843, |
| "eval_rewards/accuracy_reward_staging": 0.4765625, |
| "eval_rewards/format_reward": 0.90625, |
| "eval_rewards/format_reward_staging": 0.921875, |
| "eval_runtime": 14.6233, |
| "eval_samples_per_second": 0.547, |
| "eval_steps_per_second": 0.068, |
| "step": 240 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 76.328125, |
| "epoch": 120.5, |
| "grad_norm": 10.644912372537606, |
| "kl": 2.2607421875, |
| "learning_rate": 8.177644745078525e-06, |
| "loss": 0.0768, |
| "reward": 6.1953125, |
| "reward_std": 2.617633506655693, |
| "rewards/accuracy_reward_staging": 0.4375, |
| "rewards/format_reward": 0.8984375, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 241 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 100.0625, |
| "epoch": 121.0, |
| "grad_norm": 111.92253062146008, |
| "kl": 3.75, |
| "learning_rate": 8.091910046234552e-06, |
| "loss": 0.5913, |
| "reward": 8.21875, |
| "reward_std": 2.894813686609268, |
| "rewards/accuracy_reward_staging": 0.640625, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.90625, |
| "step": 242 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 82.8125, |
| "epoch": 121.5, |
| "grad_norm": 35.99238132907505, |
| "kl": 2.3828125, |
| "learning_rate": 8.00632065582803e-06, |
| "loss": 0.3878, |
| "reward": 4.171875, |
| "reward_std": 1.0875328481197357, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.921875, |
| "rewards/format_reward_staging": 0.90625, |
| "step": 243 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 99.71875, |
| "epoch": 122.0, |
| "grad_norm": 75.25420921507221, |
| "kl": 3.81640625, |
| "learning_rate": 7.92088309182241e-06, |
| "loss": 0.5724, |
| "reward": 9.671875, |
| "reward_std": 3.44248828291893, |
| "rewards/accuracy_reward_staging": 0.78125, |
| "rewards/format_reward": 0.921875, |
| "rewards/format_reward_staging": 0.9375, |
| "step": 244 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 70.609375, |
| "epoch": 122.5, |
| "grad_norm": 2.6776020742038265, |
| "kl": 1.322265625, |
| "learning_rate": 7.835603860618973e-06, |
| "loss": 0.0461, |
| "reward": 5.3125, |
| "reward_std": 2.067808836698532, |
| "rewards/accuracy_reward_staging": 0.34375, |
| "rewards/format_reward": 0.953125, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 245 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 69.90625, |
| "epoch": 123.0, |
| "grad_norm": 122.57486492999168, |
| "kl": 6.97265625, |
| "learning_rate": 7.750489456561351e-06, |
| "loss": 0.5057, |
| "reward": 7.84375, |
| "reward_std": 2.4874102771282196, |
| "rewards/accuracy_reward_staging": 0.59375, |
| "rewards/format_reward": 0.953125, |
| "rewards/format_reward_staging": 0.953125, |
| "step": 246 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 70.46875, |
| "epoch": 123.5, |
| "grad_norm": 7.815509886711204, |
| "kl": 2.482421875, |
| "learning_rate": 7.66554636144095e-06, |
| "loss": -0.0082, |
| "reward": 4.21875, |
| "reward_std": 0.9797288179397583, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.9375, |
| "step": 247 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 82.28125, |
| "epoch": 124.0, |
| "grad_norm": 45.837077069450096, |
| "kl": 2.609375, |
| "learning_rate": 7.580781044003324e-06, |
| "loss": 0.7357, |
| "reward": 9.953125, |
| "reward_std": 3.699725955724716, |
| "rewards/accuracy_reward_staging": 0.8125, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 248 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 71.984375, |
| "epoch": 124.5, |
| "grad_norm": 2.8144133334907924, |
| "kl": 1.357421875, |
| "learning_rate": 7.496199959455584e-06, |
| "loss": 0.0622, |
| "reward": 5.71875, |
| "reward_std": 1.41599440574646, |
| "rewards/accuracy_reward_staging": 0.375, |
| "rewards/format_reward": 1.0, |
| "rewards/format_reward_staging": 0.96875, |
| "step": 249 |
| }, |
| { |
| "epoch": 125.0, |
| "grad_norm": 45.183676518213446, |
| "learning_rate": 7.411809548974792e-06, |
| "loss": 0.6075, |
| "step": 250 |
| }, |
| { |
| "epoch": 125.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 86.328125, |
| "eval_kl": 2.1416015625, |
| "eval_loss": 0.3427232503890991, |
| "eval_reward": 7.1171875, |
| "eval_reward_std": 2.1937270909547806, |
| "eval_rewards/accuracy_reward_staging": 0.5234375, |
| "eval_rewards/format_reward": 0.9375, |
| "eval_rewards/format_reward_staging": 0.9453125, |
| "eval_runtime": 20.0325, |
| "eval_samples_per_second": 0.399, |
| "eval_steps_per_second": 0.05, |
| "step": 250 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 117.84375, |
| "epoch": 125.5, |
| "grad_norm": 49.63285548086623, |
| "kl": 4.2412109375, |
| "learning_rate": 7.327616239217432e-06, |
| "loss": 0.4637, |
| "reward": 8.2421875, |
| "reward_std": 2.541686810553074, |
| "rewards/accuracy_reward_staging": 0.640625, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.9296875, |
| "step": 251 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 77.84375, |
| "epoch": 126.0, |
| "grad_norm": 92.39572072696474, |
| "kl": 6.615234375, |
| "learning_rate": 7.243626441830009e-06, |
| "loss": 0.5525, |
| "reward": 6.28125, |
| "reward_std": 1.9668870717287064, |
| "rewards/accuracy_reward_staging": 0.453125, |
| "rewards/format_reward": 0.859375, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 252 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 101.203125, |
| "epoch": 126.5, |
| "grad_norm": 10902.1669383775, |
| "kl": 80.533203125, |
| "learning_rate": 7.159846552960774e-06, |
| "loss": 17.4523, |
| "reward": 10.0625, |
| "reward_std": 3.160782814025879, |
| "rewards/accuracy_reward_staging": 0.8125, |
| "rewards/format_reward": 0.96875, |
| "rewards/format_reward_staging": 0.96875, |
| "step": 253 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 100.96875, |
| "epoch": 127.0, |
| "grad_norm": 59.42071850320288, |
| "kl": 2.439453125, |
| "learning_rate": 7.076282952772634e-06, |
| "loss": 0.5484, |
| "reward": 4.140625, |
| "reward_std": 1.1416241526603699, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.875, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 254 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 99.296875, |
| "epoch": 127.5, |
| "grad_norm": 39.19631141515369, |
| "kl": 3.0078125, |
| "learning_rate": 6.992942004957271e-06, |
| "loss": 0.4412, |
| "reward": 6.015625, |
| "reward_std": 1.9026378691196442, |
| "rewards/accuracy_reward_staging": 0.421875, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.90625, |
| "step": 255 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 69.9375, |
| "epoch": 128.0, |
| "grad_norm": 32.45114944744183, |
| "kl": 3.859375, |
| "learning_rate": 6.909830056250527e-06, |
| "loss": 0.0636, |
| "reward": 8.984375, |
| "reward_std": 1.82603120803833, |
| "rewards/accuracy_reward_staging": 0.703125, |
| "rewards/format_reward": 0.96875, |
| "rewards/format_reward_staging": 0.984375, |
| "step": 256 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 83.421875, |
| "epoch": 128.5, |
| "grad_norm": 37.72964417808751, |
| "kl": 2.177734375, |
| "learning_rate": 6.826953435949081e-06, |
| "loss": 0.4592, |
| "reward": 8.4375, |
| "reward_std": 1.9335529208183289, |
| "rewards/accuracy_reward_staging": 0.65625, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.9375, |
| "step": 257 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 99.84375, |
| "epoch": 129.0, |
| "grad_norm": 29.569155006053123, |
| "kl": 2.361328125, |
| "learning_rate": 6.744318455428436e-06, |
| "loss": 0.6722, |
| "reward": 5.40625, |
| "reward_std": 2.279918909072876, |
| "rewards/accuracy_reward_staging": 0.359375, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.90625, |
| "step": 258 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.359375, |
| "epoch": 129.5, |
| "grad_norm": 35.01023482911739, |
| "kl": 2.0, |
| "learning_rate": 6.661931407662292e-06, |
| "loss": 0.83, |
| "reward": 8.0625, |
| "reward_std": 3.0837645530700684, |
| "rewards/accuracy_reward_staging": 0.625, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 259 |
| }, |
| { |
| "epoch": 130.0, |
| "grad_norm": 8.168155459606666, |
| "learning_rate": 6.579798566743314e-06, |
| "loss": -0.0564, |
| "step": 260 |
| }, |
| { |
| "epoch": 130.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 92.515625, |
| "eval_kl": 4.40625, |
| "eval_loss": 0.7453440427780151, |
| "eval_reward": 7.3203125, |
| "eval_reward_std": 2.501967176795006, |
| "eval_rewards/accuracy_reward_staging": 0.546875, |
| "eval_rewards/format_reward": 0.9140625, |
| "eval_rewards/format_reward_staging": 0.9375, |
| "eval_runtime": 18.9405, |
| "eval_samples_per_second": 0.422, |
| "eval_steps_per_second": 0.053, |
| "step": 260 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 91.9921875, |
| "epoch": 130.5, |
| "grad_norm": 82.11170212285718, |
| "kl": 3.4697265625, |
| "learning_rate": 6.497926187405326e-06, |
| "loss": 1.0187, |
| "reward": 7.8125, |
| "reward_std": 3.268296256661415, |
| "rewards/accuracy_reward_staging": 0.6015625, |
| "rewards/format_reward": 0.8828125, |
| "rewards/format_reward_staging": 0.9140625, |
| "step": 261 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 69.40625, |
| "epoch": 131.0, |
| "grad_norm": 59.6837830116688, |
| "kl": 4.412109375, |
| "learning_rate": 6.4163205045469975e-06, |
| "loss": 0.529, |
| "reward": 4.15625, |
| "reward_std": 1.1744744330644608, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 262 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 100.359375, |
| "epoch": 131.5, |
| "grad_norm": 23.5284023934794, |
| "kl": 1.95703125, |
| "learning_rate": 6.334987732757028e-06, |
| "loss": 0.4948, |
| "reward": 9.71875, |
| "reward_std": 4.248636841773987, |
| "rewards/accuracy_reward_staging": 0.78125, |
| "rewards/format_reward": 0.953125, |
| "rewards/format_reward_staging": 0.953125, |
| "step": 263 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 109.03125, |
| "epoch": 132.0, |
| "grad_norm": 30.888217422896634, |
| "kl": 2.41796875, |
| "learning_rate": 6.25393406584088e-06, |
| "loss": 0.4399, |
| "reward": 4.390625, |
| "reward_std": 0.2640564441680908, |
| "rewards/accuracy_reward_staging": 0.25, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.953125, |
| "step": 264 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 116.28125, |
| "epoch": 132.5, |
| "grad_norm": 65.88875351387496, |
| "kl": 3.296875, |
| "learning_rate": 6.173165676349103e-06, |
| "loss": 0.9426, |
| "reward": 5.6875, |
| "reward_std": 2.6246196627616882, |
| "rewards/accuracy_reward_staging": 0.390625, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.875, |
| "step": 265 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 108.25, |
| "epoch": 133.0, |
| "grad_norm": 28.80675313257115, |
| "kl": 2.587890625, |
| "learning_rate": 6.092688715107265e-06, |
| "loss": 0.7637, |
| "reward": 7.390625, |
| "reward_std": 3.304721415042877, |
| "rewards/accuracy_reward_staging": 0.5625, |
| "rewards/format_reward": 0.875, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 266 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 141.375, |
| "epoch": 133.5, |
| "grad_norm": 102.51920405283484, |
| "kl": 4.94140625, |
| "learning_rate": 6.0125093107475385e-06, |
| "loss": 0.8444, |
| "reward": 9.265625, |
| "reward_std": 3.659609690308571, |
| "rewards/accuracy_reward_staging": 0.75, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.875, |
| "step": 267 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 73.71875, |
| "epoch": 134.0, |
| "grad_norm": 85.74081039820032, |
| "kl": 2.640625, |
| "learning_rate": 5.932633569242e-06, |
| "loss": 0.6961, |
| "reward": 4.15625, |
| "reward_std": 1.1662903726100922, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.875, |
| "step": 268 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 70.5, |
| "epoch": 134.5, |
| "grad_norm": 220.54398843264903, |
| "kl": 23.98828125, |
| "learning_rate": 5.853067573437612e-06, |
| "loss": 0.4613, |
| "reward": 4.078125, |
| "reward_std": 1.1815360486507416, |
| "rewards/accuracy_reward_staging": 0.21875, |
| "rewards/format_reward": 0.953125, |
| "rewards/format_reward_staging": 0.9375, |
| "step": 269 |
| }, |
| { |
| "epoch": 135.0, |
| "grad_norm": 69.3686388417835, |
| "learning_rate": 5.773817382593008e-06, |
| "loss": 0.8036, |
| "step": 270 |
| }, |
| { |
| "epoch": 135.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 133.703125, |
| "eval_kl": 2.3740234375, |
| "eval_loss": 0.5488725304603577, |
| "eval_reward": 6.65625, |
| "eval_reward_std": 2.7503354847431183, |
| "eval_rewards/accuracy_reward_staging": 0.4921875, |
| "eval_rewards/format_reward": 0.8828125, |
| "eval_rewards/format_reward_staging": 0.8515625, |
| "eval_runtime": 25.2428, |
| "eval_samples_per_second": 0.317, |
| "eval_steps_per_second": 0.04, |
| "step": 270 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 87.28125, |
| "epoch": 135.5, |
| "grad_norm": 7.78574597343019, |
| "kl": 2.5751953125, |
| "learning_rate": 5.694889031917047e-06, |
| "loss": 0.0375, |
| "reward": 9.046875, |
| "reward_std": 2.9865086674690247, |
| "rewards/accuracy_reward_staging": 0.7265625, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.875, |
| "step": 271 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 69.125, |
| "epoch": 136.0, |
| "grad_norm": 63.17030825405449, |
| "kl": 1.947265625, |
| "learning_rate": 5.616288532109225e-06, |
| "loss": 0.6616, |
| "reward": 6.046875, |
| "reward_std": 1.967979907989502, |
| "rewards/accuracy_reward_staging": 0.421875, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 272 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 87.859375, |
| "epoch": 136.5, |
| "grad_norm": 5.340771645114791, |
| "kl": 2.205078125, |
| "learning_rate": 5.5380218689019125e-06, |
| "loss": 0.3324, |
| "reward": 5.265625, |
| "reward_std": 1.8316132873296738, |
| "rewards/accuracy_reward_staging": 0.34375, |
| "rewards/format_reward": 0.921875, |
| "rewards/format_reward_staging": 0.90625, |
| "step": 273 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 178.15625, |
| "epoch": 137.0, |
| "grad_norm": 22.482391196655637, |
| "kl": 3.6875, |
| "learning_rate": 5.460095002604533e-06, |
| "loss": 0.4607, |
| "reward": 6.625, |
| "reward_std": 4.032912701368332, |
| "rewards/accuracy_reward_staging": 0.5, |
| "rewards/format_reward": 0.8125, |
| "rewards/format_reward_staging": 0.8125, |
| "step": 274 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 73.421875, |
| "epoch": 137.5, |
| "grad_norm": 13.527272291689822, |
| "kl": 1.958984375, |
| "learning_rate": 5.382513867649663e-06, |
| "loss": 0.0469, |
| "reward": 6.0625, |
| "reward_std": 2.515269100666046, |
| "rewards/accuracy_reward_staging": 0.421875, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.90625, |
| "step": 275 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 161.09375, |
| "epoch": 138.0, |
| "grad_norm": 610.5487969369213, |
| "kl": 6.744140625, |
| "learning_rate": 5.305284372141095e-06, |
| "loss": 1.6861, |
| "reward": 7.59375, |
| "reward_std": 3.4400684684515, |
| "rewards/accuracy_reward_staging": 0.609375, |
| "rewards/format_reward": 0.75, |
| "rewards/format_reward_staging": 0.75, |
| "step": 276 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 204.046875, |
| "epoch": 138.5, |
| "grad_norm": 40.311818383902995, |
| "kl": 43.591796875, |
| "learning_rate": 5.228412397403916e-06, |
| "loss": 0.7055, |
| "reward": 4.25, |
| "reward_std": 2.983577609062195, |
| "rewards/accuracy_reward_staging": 0.265625, |
| "rewards/format_reward": 0.8125, |
| "rewards/format_reward_staging": 0.78125, |
| "step": 277 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 74.875, |
| "epoch": 139.0, |
| "grad_norm": 38.02177180776487, |
| "kl": 4.14453125, |
| "learning_rate": 5.151903797536631e-06, |
| "loss": 0.6329, |
| "reward": 6.765625, |
| "reward_std": 2.8731206506490707, |
| "rewards/accuracy_reward_staging": 0.5, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.875, |
| "step": 278 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 102.953125, |
| "epoch": 139.5, |
| "grad_norm": 15.046522887761398, |
| "kl": 2.10546875, |
| "learning_rate": 5.075764398965331e-06, |
| "loss": 0.4068, |
| "reward": 1.96875, |
| "reward_std": 1.4316468834877014, |
| "rewards/accuracy_reward_staging": 0.03125, |
| "rewards/format_reward": 0.84375, |
| "rewards/format_reward_staging": 0.8125, |
| "step": 279 |
| }, |
| { |
| "epoch": 140.0, |
| "grad_norm": 21.00508786132047, |
| "learning_rate": 5.000000000000003e-06, |
| "loss": 0.3769, |
| "step": 280 |
| }, |
| { |
| "epoch": 140.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 108.140625, |
| "eval_kl": 2.791015625, |
| "eval_loss": 0.3034168779850006, |
| "eval_reward": 5.671875, |
| "eval_reward_std": 3.041542984545231, |
| "eval_rewards/accuracy_reward_staging": 0.3984375, |
| "eval_rewards/format_reward": 0.8515625, |
| "eval_rewards/format_reward_staging": 0.8359375, |
| "eval_runtime": 26.3078, |
| "eval_samples_per_second": 0.304, |
| "eval_steps_per_second": 0.038, |
| "step": 280 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 170.3359375, |
| "epoch": 140.5, |
| "grad_norm": 83.46938912876466, |
| "kl": 1.9345703125, |
| "learning_rate": 4.924616370392962e-06, |
| "loss": 0.6293, |
| "reward": 7.96875, |
| "reward_std": 4.6416375786066055, |
| "rewards/accuracy_reward_staging": 0.6328125, |
| "rewards/format_reward": 0.8359375, |
| "rewards/format_reward_staging": 0.8046875, |
| "step": 281 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 127.6875, |
| "epoch": 141.0, |
| "grad_norm": 9.042778775609163, |
| "kl": 2.693359375, |
| "learning_rate": 4.849619250899458e-06, |
| "loss": 0.3913, |
| "reward": 5.640625, |
| "reward_std": 2.8702940493822098, |
| "rewards/accuracy_reward_staging": 0.390625, |
| "rewards/format_reward": 0.84375, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 282 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 139.03125, |
| "epoch": 141.5, |
| "grad_norm": 49.6878570546121, |
| "kl": 3.50390625, |
| "learning_rate": 4.775014352840512e-06, |
| "loss": 0.9548, |
| "reward": 6.390625, |
| "reward_std": 3.8147538006305695, |
| "rewards/accuracy_reward_staging": 0.484375, |
| "rewards/format_reward": 0.78125, |
| "rewards/format_reward_staging": 0.765625, |
| "step": 283 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 70.84375, |
| "epoch": 142.0, |
| "grad_norm": 2.936217141963846, |
| "kl": 1.498046875, |
| "learning_rate": 4.700807357667953e-06, |
| "loss": 0.0258, |
| "reward": 5.015625, |
| "reward_std": 1.7188784629106522, |
| "rewards/accuracy_reward_staging": 0.3125, |
| "rewards/format_reward": 0.96875, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 284 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 234.46875, |
| "epoch": 142.5, |
| "grad_norm": 231.51752728996516, |
| "kl": 7.8046875, |
| "learning_rate": 4.627003916531761e-06, |
| "loss": 2.1654, |
| "reward": 3.1875, |
| "reward_std": 2.9490927308797836, |
| "rewards/accuracy_reward_staging": 0.171875, |
| "rewards/format_reward": 0.75, |
| "rewards/format_reward_staging": 0.71875, |
| "step": 285 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 132.8125, |
| "epoch": 143.0, |
| "grad_norm": 55.72832613480593, |
| "kl": 1.64453125, |
| "learning_rate": 4.5536096498497295e-06, |
| "loss": 1.0135, |
| "reward": 7.109375, |
| "reward_std": 2.9616019427776337, |
| "rewards/accuracy_reward_staging": 0.53125, |
| "rewards/format_reward": 0.875, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 286 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 131.984375, |
| "epoch": 143.5, |
| "grad_norm": 500.69423335954184, |
| "kl": 9.09375, |
| "learning_rate": 4.480630146879419e-06, |
| "loss": 0.8396, |
| "reward": 4.234375, |
| "reward_std": 2.296322599053383, |
| "rewards/accuracy_reward_staging": 0.265625, |
| "rewards/format_reward": 0.8125, |
| "rewards/format_reward_staging": 0.765625, |
| "step": 287 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 163.25, |
| "epoch": 144.0, |
| "grad_norm": 78.65105992380144, |
| "kl": 1.7265625, |
| "learning_rate": 4.408070965292534e-06, |
| "loss": 0.5859, |
| "reward": 6.453125, |
| "reward_std": 3.582520604133606, |
| "rewards/accuracy_reward_staging": 0.46875, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.875, |
| "step": 288 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 152.4375, |
| "epoch": 144.5, |
| "grad_norm": 145.11899661299873, |
| "kl": 3.505859375, |
| "learning_rate": 4.335937630751675e-06, |
| "loss": 0.8424, |
| "reward": 3.5, |
| "reward_std": 2.5658179223537445, |
| "rewards/accuracy_reward_staging": 0.1875, |
| "rewards/format_reward": 0.828125, |
| "rewards/format_reward_staging": 0.796875, |
| "step": 289 |
| }, |
| { |
| "epoch": 145.0, |
| "grad_norm": 21.829024770798654, |
| "learning_rate": 4.264235636489542e-06, |
| "loss": 0.4036, |
| "step": 290 |
| }, |
| { |
| "epoch": 145.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 118.2890625, |
| "eval_kl": 2.580078125, |
| "eval_loss": 0.45962250232696533, |
| "eval_reward": 5.0390625, |
| "eval_reward_std": 2.569364294409752, |
| "eval_rewards/accuracy_reward_staging": 0.3359375, |
| "eval_rewards/format_reward": 0.8515625, |
| "eval_rewards/format_reward_staging": 0.828125, |
| "eval_runtime": 25.2258, |
| "eval_samples_per_second": 0.317, |
| "eval_steps_per_second": 0.04, |
| "step": 290 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 94.2578125, |
| "epoch": 145.5, |
| "grad_norm": 99.26323743266907, |
| "kl": 2.0009765625, |
| "learning_rate": 4.192970442890602e-06, |
| "loss": 0.9692, |
| "reward": 6.3828125, |
| "reward_std": 2.864999257028103, |
| "rewards/accuracy_reward_staging": 0.4609375, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.8671875, |
| "step": 291 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 242.90625, |
| "epoch": 146.0, |
| "grad_norm": 2195.8990163427725, |
| "kl": 20.560546875, |
| "learning_rate": 4.12214747707527e-06, |
| "loss": 2.8321, |
| "reward": 3.453125, |
| "reward_std": 2.1853206753730774, |
| "rewards/accuracy_reward_staging": 0.1875, |
| "rewards/format_reward": 0.78125, |
| "rewards/format_reward_staging": 0.796875, |
| "step": 292 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 70.984375, |
| "epoch": 146.5, |
| "grad_norm": 5.612016216510325, |
| "kl": 1.904296875, |
| "learning_rate": 4.051772132486589e-06, |
| "loss": -0.0196, |
| "reward": 6.5625, |
| "reward_std": 1.6346493661403656, |
| "rewards/accuracy_reward_staging": 0.46875, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.9375, |
| "step": 293 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 116.75, |
| "epoch": 147.0, |
| "grad_norm": 24.439737033122462, |
| "kl": 5.474609375, |
| "learning_rate": 3.981849768479516e-06, |
| "loss": 1.072, |
| "reward": 3.296875, |
| "reward_std": 2.2417181879281998, |
| "rewards/accuracy_reward_staging": 0.171875, |
| "rewards/format_reward": 0.796875, |
| "rewards/format_reward_staging": 0.78125, |
| "step": 294 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 219.84375, |
| "epoch": 147.5, |
| "grad_norm": 149.74167295056185, |
| "kl": 6.009765625, |
| "learning_rate": 3.912385709912794e-06, |
| "loss": 1.0669, |
| "reward": 4.1875, |
| "reward_std": 2.5385715812444687, |
| "rewards/accuracy_reward_staging": 0.265625, |
| "rewards/format_reward": 0.75, |
| "rewards/format_reward_staging": 0.78125, |
| "step": 295 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 98.15625, |
| "epoch": 148.0, |
| "grad_norm": 19.302640459669014, |
| "kl": 2.12109375, |
| "learning_rate": 3.8433852467434175e-06, |
| "loss": 0.4691, |
| "reward": 6.453125, |
| "reward_std": 2.6673848778009415, |
| "rewards/accuracy_reward_staging": 0.46875, |
| "rewards/format_reward": 0.875, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 296 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 190.71875, |
| "epoch": 148.5, |
| "grad_norm": 20.309279907380233, |
| "kl": 3.623046875, |
| "learning_rate": 3.774853633623806e-06, |
| "loss": 0.8401, |
| "reward": 4.875, |
| "reward_std": 2.2293783873319626, |
| "rewards/accuracy_reward_staging": 0.328125, |
| "rewards/format_reward": 0.796875, |
| "rewards/format_reward_staging": 0.796875, |
| "step": 297 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 131.5, |
| "epoch": 149.0, |
| "grad_norm": 239.5186164999186, |
| "kl": 3.22265625, |
| "learning_rate": 3.7067960895016277e-06, |
| "loss": 1.4061, |
| "reward": 6.734375, |
| "reward_std": 3.5877325236797333, |
| "rewards/accuracy_reward_staging": 0.5, |
| "rewards/format_reward": 0.875, |
| "rewards/format_reward_staging": 0.859375, |
| "step": 298 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 132.84375, |
| "epoch": 149.5, |
| "grad_norm": 14.77833452952522, |
| "kl": 2.74609375, |
| "learning_rate": 3.6392177972223596e-06, |
| "loss": 1.032, |
| "reward": 3.984375, |
| "reward_std": 1.3325151801109314, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.796875, |
| "rewards/format_reward_staging": 0.84375, |
| "step": 299 |
| }, |
| { |
| "epoch": 150.0, |
| "grad_norm": 11.339677519890218, |
| "learning_rate": 3.5721239031346067e-06, |
| "loss": 0.9908, |
| "step": 300 |
| }, |
| { |
| "epoch": 150.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 153.859375, |
| "eval_kl": 2.41015625, |
| "eval_loss": 0.36936071515083313, |
| "eval_reward": 5.0234375, |
| "eval_reward_std": 2.693635329604149, |
| "eval_rewards/accuracy_reward_staging": 0.3359375, |
| "eval_rewards/format_reward": 0.8203125, |
| "eval_rewards/format_reward_staging": 0.84375, |
| "eval_runtime": 30.2609, |
| "eval_samples_per_second": 0.264, |
| "eval_steps_per_second": 0.033, |
| "step": 300 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 105.4609375, |
| "epoch": 150.5, |
| "grad_norm": 13.48307651344578, |
| "kl": 2.556640625, |
| "learning_rate": 3.505519516698165e-06, |
| "loss": 0.8506, |
| "reward": 5.8515625, |
| "reward_std": 2.9722983986139297, |
| "rewards/accuracy_reward_staging": 0.4140625, |
| "rewards/format_reward": 0.84375, |
| "rewards/format_reward_staging": 0.8671875, |
| "step": 301 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 148.3125, |
| "epoch": 151.0, |
| "grad_norm": 86.33754338379147, |
| "kl": 4.666015625, |
| "learning_rate": 3.4394097100949286e-06, |
| "loss": 0.7356, |
| "reward": 6.453125, |
| "reward_std": 3.2461503744125366, |
| "rewards/accuracy_reward_staging": 0.484375, |
| "rewards/format_reward": 0.8125, |
| "rewards/format_reward_staging": 0.796875, |
| "step": 302 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 124.109375, |
| "epoch": 151.5, |
| "grad_norm": 10.864856379472597, |
| "kl": 2.119140625, |
| "learning_rate": 3.3737995178426276e-06, |
| "loss": 0.6197, |
| "reward": 10.125, |
| "reward_std": 4.969914525747299, |
| "rewards/accuracy_reward_staging": 0.84375, |
| "rewards/format_reward": 0.828125, |
| "rewards/format_reward_staging": 0.859375, |
| "step": 303 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 126.375, |
| "epoch": 152.0, |
| "grad_norm": 14.28207912655626, |
| "kl": 6.5078125, |
| "learning_rate": 3.308693936411421e-06, |
| "loss": 0.7237, |
| "reward": 1.8125, |
| "reward_std": 1.1962126940488815, |
| "rewards/accuracy_reward_staging": 0.015625, |
| "rewards/format_reward": 0.84375, |
| "rewards/format_reward_staging": 0.8125, |
| "step": 304 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 70.15625, |
| "epoch": 152.5, |
| "grad_norm": 129.83316187917453, |
| "kl": 5.720703125, |
| "learning_rate": 3.2440979238433977e-06, |
| "loss": 0.2064, |
| "reward": 4.328125, |
| "reward_std": 0.47754141688346863, |
| "rewards/accuracy_reward_staging": 0.25, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 305 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 168.125, |
| "epoch": 153.0, |
| "grad_norm": 14.37299684325007, |
| "kl": 3.173828125, |
| "learning_rate": 3.1800163993750166e-06, |
| "loss": 0.4395, |
| "reward": 8.953125, |
| "reward_std": 4.095484673976898, |
| "rewards/accuracy_reward_staging": 0.71875, |
| "rewards/format_reward": 0.875, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 306 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 167.015625, |
| "epoch": 153.5, |
| "grad_norm": 6089.787677039342, |
| "kl": 134.119140625, |
| "learning_rate": 3.116454243062459e-06, |
| "loss": 6.8109, |
| "reward": 4.640625, |
| "reward_std": 2.973371237516403, |
| "rewards/accuracy_reward_staging": 0.296875, |
| "rewards/format_reward": 0.828125, |
| "rewards/format_reward_staging": 0.84375, |
| "step": 307 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 100.0, |
| "epoch": 154.0, |
| "grad_norm": 18.82555217372982, |
| "kl": 1.556640625, |
| "learning_rate": 3.0534162954100264e-06, |
| "loss": 0.7008, |
| "reward": 8.09375, |
| "reward_std": 2.37243390083313, |
| "rewards/accuracy_reward_staging": 0.625, |
| "rewards/format_reward": 0.921875, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 308 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 145.140625, |
| "epoch": 154.5, |
| "grad_norm": 65.27226167321199, |
| "kl": 8.71484375, |
| "learning_rate": 2.990907357001491e-06, |
| "loss": 0.7082, |
| "reward": 7.609375, |
| "reward_std": 2.803087517619133, |
| "rewards/accuracy_reward_staging": 0.59375, |
| "rewards/format_reward": 0.84375, |
| "rewards/format_reward_staging": 0.828125, |
| "step": 309 |
| }, |
| { |
| "epoch": 155.0, |
| "grad_norm": 11.996878111801491, |
| "learning_rate": 2.9289321881345257e-06, |
| "loss": 0.412, |
| "step": 310 |
| }, |
| { |
| "epoch": 155.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 143.1015625, |
| "eval_kl": 1.8232421875, |
| "eval_loss": 1.0165082216262817, |
| "eval_reward": 6.234375, |
| "eval_reward_std": 2.7126059383153915, |
| "eval_rewards/accuracy_reward_staging": 0.4453125, |
| "eval_rewards/format_reward": 0.890625, |
| "eval_rewards/format_reward_staging": 0.890625, |
| "eval_runtime": 43.8596, |
| "eval_samples_per_second": 0.182, |
| "eval_steps_per_second": 0.023, |
| "step": 310 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 116.7578125, |
| "epoch": 155.5, |
| "grad_norm": 20.569051772890276, |
| "kl": 1.759765625, |
| "learning_rate": 2.867495508458186e-06, |
| "loss": 0.2798, |
| "reward": 4.6484375, |
| "reward_std": 1.7857269644737244, |
| "rewards/accuracy_reward_staging": 0.28125, |
| "rewards/format_reward": 0.9140625, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 311 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 119.875, |
| "epoch": 156.0, |
| "grad_norm": 66.09995163248371, |
| "kl": 2.607421875, |
| "learning_rate": 2.8066019966134907e-06, |
| "loss": 1.0753, |
| "reward": 8.84375, |
| "reward_std": 4.13113260269165, |
| "rewards/accuracy_reward_staging": 0.71875, |
| "rewards/format_reward": 0.8125, |
| "rewards/format_reward_staging": 0.84375, |
| "step": 312 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 102.953125, |
| "epoch": 156.5, |
| "grad_norm": 8.187280314820471, |
| "kl": 3.30859375, |
| "learning_rate": 2.746256289877126e-06, |
| "loss": 0.3809, |
| "reward": 9.421875, |
| "reward_std": 3.6460390239953995, |
| "rewards/accuracy_reward_staging": 0.765625, |
| "rewards/format_reward": 0.875, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 313 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 131.0625, |
| "epoch": 157.0, |
| "grad_norm": 5.144738650630215, |
| "kl": 2.107421875, |
| "learning_rate": 2.6864629838082957e-06, |
| "loss": 0.6558, |
| "reward": 4.109375, |
| "reward_std": 1.1705190539360046, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.875, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 314 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 116.03125, |
| "epoch": 157.5, |
| "grad_norm": 25.762327223945867, |
| "kl": 3.041015625, |
| "learning_rate": 2.6272266318987606e-06, |
| "loss": 0.7186, |
| "reward": 8.234375, |
| "reward_std": 2.516293704509735, |
| "rewards/accuracy_reward_staging": 0.640625, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 315 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.53125, |
| "epoch": 158.0, |
| "grad_norm": 13.742094957162939, |
| "kl": 2.0859375, |
| "learning_rate": 2.5685517452260566e-06, |
| "loss": 0.6218, |
| "reward": 5.78125, |
| "reward_std": 2.4056650549173355, |
| "rewards/accuracy_reward_staging": 0.40625, |
| "rewards/format_reward": 0.859375, |
| "rewards/format_reward_staging": 0.859375, |
| "step": 316 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 126.1875, |
| "epoch": 158.5, |
| "grad_norm": 16.04349339503592, |
| "kl": 2.7265625, |
| "learning_rate": 2.5104427921099783e-06, |
| "loss": 0.3688, |
| "reward": 5.984375, |
| "reward_std": 2.61943382024765, |
| "rewards/accuracy_reward_staging": 0.421875, |
| "rewards/format_reward": 0.875, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 317 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 99.15625, |
| "epoch": 159.0, |
| "grad_norm": 9.050419274649617, |
| "kl": 4.8828125, |
| "learning_rate": 2.45290419777228e-06, |
| "loss": 0.2878, |
| "reward": 6.5625, |
| "reward_std": 3.38299697637558, |
| "rewards/accuracy_reward_staging": 0.484375, |
| "rewards/format_reward": 0.859375, |
| "rewards/format_reward_staging": 0.859375, |
| "step": 318 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 169.609375, |
| "epoch": 159.5, |
| "grad_norm": 29.02785370667893, |
| "kl": 4.904296875, |
| "learning_rate": 2.395940343999691e-06, |
| "loss": 0.5377, |
| "reward": 5.65625, |
| "reward_std": 2.224781885743141, |
| "rewards/accuracy_reward_staging": 0.40625, |
| "rewards/format_reward": 0.796875, |
| "rewards/format_reward_staging": 0.796875, |
| "step": 319 |
| }, |
| { |
| "epoch": 160.0, |
| "grad_norm": 147.1237773705263, |
| "learning_rate": 2.339555568810221e-06, |
| "loss": 0.9691, |
| "step": 320 |
| }, |
| { |
| "epoch": 160.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 167.6640625, |
| "eval_kl": 1.9326171875, |
| "eval_loss": 0.7716435790061951, |
| "eval_reward": 6.6484375, |
| "eval_reward_std": 2.216530680656433, |
| "eval_rewards/accuracy_reward_staging": 0.4921875, |
| "eval_rewards/format_reward": 0.84375, |
| "eval_rewards/format_reward_staging": 0.8828125, |
| "eval_runtime": 31.1003, |
| "eval_samples_per_second": 0.257, |
| "eval_steps_per_second": 0.032, |
| "step": 320 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 117.1796875, |
| "epoch": 160.5, |
| "grad_norm": 123.65805346609463, |
| "kl": 7.9814453125, |
| "learning_rate": 2.2837541661228024e-06, |
| "loss": 0.9258, |
| "reward": 5.8671875, |
| "reward_std": 1.9323227554559708, |
| "rewards/accuracy_reward_staging": 0.4140625, |
| "rewards/format_reward": 0.8359375, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 321 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 189.71875, |
| "epoch": 161.0, |
| "grad_norm": 4.10729901700796, |
| "kl": 2.5625, |
| "learning_rate": 2.2285403854302912e-06, |
| "loss": 0.5784, |
| "reward": 7.734375, |
| "reward_std": 5.214049249887466, |
| "rewards/accuracy_reward_staging": 0.625, |
| "rewards/format_reward": 0.734375, |
| "rewards/format_reward_staging": 0.75, |
| "step": 322 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 218.84375, |
| "epoch": 161.5, |
| "grad_norm": 26.92757845878815, |
| "kl": 2.748046875, |
| "learning_rate": 2.173918431475861e-06, |
| "loss": 0.8819, |
| "reward": 9.484375, |
| "reward_std": 5.199484676122665, |
| "rewards/accuracy_reward_staging": 0.796875, |
| "rewards/format_reward": 0.75, |
| "rewards/format_reward_staging": 0.765625, |
| "step": 323 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 117.75, |
| "epoch": 162.0, |
| "grad_norm": 13.57050998781947, |
| "kl": 1.6328125, |
| "learning_rate": 2.119892463932781e-06, |
| "loss": 0.6371, |
| "reward": 1.671875, |
| "reward_std": 0.47480545938014984, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.8125, |
| "rewards/format_reward_staging": 0.859375, |
| "step": 324 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 157.125, |
| "epoch": 162.5, |
| "grad_norm": 4.089653024083605, |
| "kl": 2.189453125, |
| "learning_rate": 2.0664665970876496e-06, |
| "loss": 0.1603, |
| "reward": 3.640625, |
| "reward_std": 3.069836288690567, |
| "rewards/accuracy_reward_staging": 0.203125, |
| "rewards/format_reward": 0.828125, |
| "rewards/format_reward_staging": 0.78125, |
| "step": 325 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 224.4375, |
| "epoch": 163.0, |
| "grad_norm": 17.786059949816178, |
| "kl": 5.447265625, |
| "learning_rate": 2.013644899527074e-06, |
| "loss": 1.1477, |
| "reward": 7.015625, |
| "reward_std": 3.553600549697876, |
| "rewards/accuracy_reward_staging": 0.546875, |
| "rewards/format_reward": 0.734375, |
| "rewards/format_reward_staging": 0.8125, |
| "step": 326 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 189.75, |
| "epoch": 163.5, |
| "grad_norm": 9.621427088420068, |
| "kl": 9.373046875, |
| "learning_rate": 1.961431393827827e-06, |
| "loss": 0.7149, |
| "reward": 4.359375, |
| "reward_std": 3.1985532343387604, |
| "rewards/accuracy_reward_staging": 0.296875, |
| "rewards/format_reward": 0.6875, |
| "rewards/format_reward_staging": 0.703125, |
| "step": 327 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 229.59375, |
| "epoch": 164.0, |
| "grad_norm": 76.47407109945985, |
| "kl": 3.7265625, |
| "learning_rate": 1.9098300562505266e-06, |
| "loss": 0.6987, |
| "reward": 3.78125, |
| "reward_std": 3.0820604413747787, |
| "rewards/accuracy_reward_staging": 0.25, |
| "rewards/format_reward": 0.640625, |
| "rewards/format_reward_staging": 0.640625, |
| "step": 328 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 230.1875, |
| "epoch": 164.5, |
| "grad_norm": 6.994840256019957, |
| "kl": 2.673828125, |
| "learning_rate": 1.858844816436809e-06, |
| "loss": 0.3282, |
| "reward": 5.0625, |
| "reward_std": 3.695622056722641, |
| "rewards/accuracy_reward_staging": 0.359375, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.75, |
| "step": 329 |
| }, |
| { |
| "epoch": 165.0, |
| "grad_norm": 9.165691965744147, |
| "learning_rate": 1.808479557110081e-06, |
| "loss": 0.2334, |
| "step": 330 |
| }, |
| { |
| "epoch": 165.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 101.296875, |
| "eval_kl": 3.767578125, |
| "eval_loss": 0.18460440635681152, |
| "eval_reward": 5.234375, |
| "eval_reward_std": 2.932789586484432, |
| "eval_rewards/accuracy_reward_staging": 0.3671875, |
| "eval_rewards/format_reward": 0.7734375, |
| "eval_rewards/format_reward_staging": 0.7890625, |
| "eval_runtime": 19.4506, |
| "eval_samples_per_second": 0.411, |
| "eval_steps_per_second": 0.051, |
| "step": 330 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 120.9609375, |
| "epoch": 165.5, |
| "grad_norm": 7.198852766772789, |
| "kl": 3.2666015625, |
| "learning_rate": 1.7587381137798432e-06, |
| "loss": 0.0493, |
| "reward": 4.8671875, |
| "reward_std": 1.5325812175869942, |
| "rewards/accuracy_reward_staging": 0.3203125, |
| "rewards/format_reward": 0.828125, |
| "rewards/format_reward_staging": 0.8359375, |
| "step": 331 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 247.46875, |
| "epoch": 166.0, |
| "grad_norm": 8.940874552922875, |
| "kl": 7.59765625, |
| "learning_rate": 1.709624274449584e-06, |
| "loss": 0.2349, |
| "reward": 6.109375, |
| "reward_std": 4.367333948612213, |
| "rewards/accuracy_reward_staging": 0.5, |
| "rewards/format_reward": 0.546875, |
| "rewards/format_reward_staging": 0.5625, |
| "step": 332 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 128.6875, |
| "epoch": 166.5, |
| "grad_norm": 29.279440961120276, |
| "kl": 4.328125, |
| "learning_rate": 1.6611417793283192e-06, |
| "loss": 1.111, |
| "reward": 7.34375, |
| "reward_std": 3.199866473674774, |
| "rewards/accuracy_reward_staging": 0.5625, |
| "rewards/format_reward": 0.828125, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 333 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 164.21875, |
| "epoch": 167.0, |
| "grad_norm": 7.572100523744779, |
| "kl": 3.65625, |
| "learning_rate": 1.6132943205457607e-06, |
| "loss": 0.2248, |
| "reward": 2.0625, |
| "reward_std": 2.6606018245220184, |
| "rewards/accuracy_reward_staging": 0.09375, |
| "rewards/format_reward": 0.546875, |
| "rewards/format_reward_staging": 0.578125, |
| "step": 334 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 117.0, |
| "epoch": 167.5, |
| "grad_norm": 38.94662493791989, |
| "kl": 2.041015625, |
| "learning_rate": 1.566085541871145e-06, |
| "loss": 0.9721, |
| "reward": 2.0, |
| "reward_std": 1.3283163905143738, |
| "rewards/accuracy_reward_staging": 0.03125, |
| "rewards/format_reward": 0.828125, |
| "rewards/format_reward_staging": 0.859375, |
| "step": 335 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 257.84375, |
| "epoch": 168.0, |
| "grad_norm": 12.030868864292964, |
| "kl": 2.9765625, |
| "learning_rate": 1.5195190384357405e-06, |
| "loss": 0.347, |
| "reward": 7.15625, |
| "reward_std": 5.804089158773422, |
| "rewards/accuracy_reward_staging": 0.609375, |
| "rewards/format_reward": 0.546875, |
| "rewards/format_reward_staging": 0.515625, |
| "step": 336 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 149.484375, |
| "epoch": 168.5, |
| "grad_norm": 4.888452594544771, |
| "kl": 2.025390625, |
| "learning_rate": 1.4735983564590784e-06, |
| "loss": 0.1412, |
| "reward": 5.875, |
| "reward_std": 1.471491515636444, |
| "rewards/accuracy_reward_staging": 0.4375, |
| "rewards/format_reward": 0.734375, |
| "rewards/format_reward_staging": 0.765625, |
| "step": 337 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 124.65625, |
| "epoch": 169.0, |
| "grad_norm": 14.198275343939475, |
| "kl": 6.076171875, |
| "learning_rate": 1.4283269929788779e-06, |
| "loss": 0.1445, |
| "reward": 5.4375, |
| "reward_std": 3.588435083627701, |
| "rewards/accuracy_reward_staging": 0.390625, |
| "rewards/format_reward": 0.75, |
| "rewards/format_reward_staging": 0.78125, |
| "step": 338 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 138.421875, |
| "epoch": 169.5, |
| "grad_norm": 58.80975012160593, |
| "kl": 4.6015625, |
| "learning_rate": 1.3837083955847418e-06, |
| "loss": 0.4146, |
| "reward": 4.328125, |
| "reward_std": 2.9866636991500854, |
| "rewards/accuracy_reward_staging": 0.28125, |
| "rewards/format_reward": 0.75, |
| "rewards/format_reward_staging": 0.765625, |
| "step": 339 |
| }, |
| { |
| "epoch": 170.0, |
| "grad_norm": 12.936108165176186, |
| "learning_rate": 1.339745962155613e-06, |
| "loss": 0.4075, |
| "step": 340 |
| }, |
| { |
| "epoch": 170.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 158.3125, |
| "eval_kl": 3.1328125, |
| "eval_loss": 0.5399841070175171, |
| "eval_reward": 4.9921875, |
| "eval_reward_std": 3.382713630795479, |
| "eval_rewards/accuracy_reward_staging": 0.3515625, |
| "eval_rewards/format_reward": 0.734375, |
| "eval_rewards/format_reward_staging": 0.7421875, |
| "eval_runtime": 32.245, |
| "eval_samples_per_second": 0.248, |
| "eval_steps_per_second": 0.031, |
| "step": 340 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 127.2109375, |
| "epoch": 170.5, |
| "grad_norm": 16.397843407801986, |
| "kl": 5.271484375, |
| "learning_rate": 1.2964430406010032e-06, |
| "loss": 0.4235, |
| "reward": 5.1875, |
| "reward_std": 2.5617306530475616, |
| "rewards/accuracy_reward_staging": 0.375, |
| "rewards/format_reward": 0.7109375, |
| "rewards/format_reward_staging": 0.7265625, |
| "step": 341 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 181.03125, |
| "epoch": 171.0, |
| "grad_norm": 10.177450513861592, |
| "kl": 6.2265625, |
| "learning_rate": 1.2538029286060428e-06, |
| "loss": 0.2828, |
| "reward": 4.0, |
| "reward_std": 3.566714286804199, |
| "rewards/accuracy_reward_staging": 0.265625, |
| "rewards/format_reward": 0.65625, |
| "rewards/format_reward_staging": 0.6875, |
| "step": 342 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 77.390625, |
| "epoch": 171.5, |
| "grad_norm": 5.299938896991584, |
| "kl": 2.30859375, |
| "learning_rate": 1.2118288733803474e-06, |
| "loss": 0.1981, |
| "reward": 1.828125, |
| "reward_std": 0.4453761428594589, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 343 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 233.1875, |
| "epoch": 172.0, |
| "grad_norm": 9.72856878961463, |
| "kl": 2.810546875, |
| "learning_rate": 1.1705240714107301e-06, |
| "loss": 0.3691, |
| "reward": 7.234375, |
| "reward_std": 4.719546392560005, |
| "rewards/accuracy_reward_staging": 0.609375, |
| "rewards/format_reward": 0.5625, |
| "rewards/format_reward_staging": 0.578125, |
| "step": 344 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 156.34375, |
| "epoch": 172.5, |
| "grad_norm": 13.674261851507499, |
| "kl": 5.080078125, |
| "learning_rate": 1.129891668217783e-06, |
| "loss": 0.3796, |
| "reward": 1.515625, |
| "reward_std": 0.3468210846185684, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.75, |
| "rewards/format_reward_staging": 0.765625, |
| "step": 345 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 125.0625, |
| "epoch": 173.0, |
| "grad_norm": 17.14371369730622, |
| "kl": 3.12109375, |
| "learning_rate": 1.0899347581163222e-06, |
| "loss": 0.491, |
| "reward": 8.140625, |
| "reward_std": 1.9854381084442139, |
| "rewards/accuracy_reward_staging": 0.671875, |
| "rewards/format_reward": 0.703125, |
| "rewards/format_reward_staging": 0.71875, |
| "step": 346 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 193.859375, |
| "epoch": 173.5, |
| "grad_norm": 12.20716781608, |
| "kl": 3.224609375, |
| "learning_rate": 1.0506563839797501e-06, |
| "loss": 0.0906, |
| "reward": 3.6875, |
| "reward_std": 3.41329425573349, |
| "rewards/accuracy_reward_staging": 0.234375, |
| "rewards/format_reward": 0.671875, |
| "rewards/format_reward_staging": 0.671875, |
| "step": 347 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 152.8125, |
| "epoch": 174.0, |
| "grad_norm": 5.4458912166318, |
| "kl": 6.1875, |
| "learning_rate": 1.012059537008332e-06, |
| "loss": 0.2197, |
| "reward": 4.15625, |
| "reward_std": 2.4828383028507233, |
| "rewards/accuracy_reward_staging": 0.296875, |
| "rewards/format_reward": 0.546875, |
| "rewards/format_reward_staging": 0.640625, |
| "step": 348 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 123.328125, |
| "epoch": 174.5, |
| "grad_norm": 11.502073299068844, |
| "kl": 5.08203125, |
| "learning_rate": 9.74147156501396e-07, |
| "loss": 0.233, |
| "reward": 4.921875, |
| "reward_std": 2.9725054800510406, |
| "rewards/accuracy_reward_staging": 0.359375, |
| "rewards/format_reward": 0.65625, |
| "rewards/format_reward_staging": 0.671875, |
| "step": 349 |
| }, |
| { |
| "epoch": 175.0, |
| "grad_norm": 41.38843600405391, |
| "learning_rate": 9.369221296335007e-07, |
| "loss": 0.3717, |
| "step": 350 |
| }, |
| { |
| "epoch": 175.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 150.5703125, |
| "eval_kl": 5.74609375, |
| "eval_loss": 0.37665992975234985, |
| "eval_reward": 3.9921875, |
| "eval_reward_std": 2.3707948103547096, |
| "eval_rewards/accuracy_reward_staging": 0.265625, |
| "eval_rewards/format_reward": 0.671875, |
| "eval_rewards/format_reward_staging": 0.6640625, |
| "eval_runtime": 29.995, |
| "eval_samples_per_second": 0.267, |
| "eval_steps_per_second": 0.033, |
| "step": 350 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 176.296875, |
| "epoch": 175.5, |
| "grad_norm": 14.284328609401078, |
| "kl": 2.892578125, |
| "learning_rate": 9.00387291234569e-07, |
| "loss": 0.3948, |
| "reward": 5.5390625, |
| "reward_std": 2.7443336844444275, |
| "rewards/accuracy_reward_staging": 0.4140625, |
| "rewards/format_reward": 0.6953125, |
| "rewards/format_reward_staging": 0.703125, |
| "step": 351 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 184.53125, |
| "epoch": 176.0, |
| "grad_norm": 59.21817249281153, |
| "kl": 1.8359375, |
| "learning_rate": 8.645454235739903e-07, |
| "loss": 0.94, |
| "reward": 1.421875, |
| "reward_std": 0.3859764039516449, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.6875, |
| "rewards/format_reward_staging": 0.734375, |
| "step": 352 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 225.109375, |
| "epoch": 176.5, |
| "grad_norm": 5.721722376999943, |
| "kl": 3.16015625, |
| "learning_rate": 8.293992561487596e-07, |
| "loss": 0.187, |
| "reward": 5.75, |
| "reward_std": 1.985233724117279, |
| "rewards/accuracy_reward_staging": 0.46875, |
| "rewards/format_reward": 0.515625, |
| "rewards/format_reward_staging": 0.546875, |
| "step": 353 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 95.09375, |
| "epoch": 177.0, |
| "grad_norm": 11.943217011119804, |
| "kl": 1.8671875, |
| "learning_rate": 7.949514654755963e-07, |
| "loss": 0.5445, |
| "reward": 3.546875, |
| "reward_std": 2.243361175060272, |
| "rewards/accuracy_reward_staging": 0.171875, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 354 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 159.359375, |
| "epoch": 177.5, |
| "grad_norm": 7.480612147716681, |
| "kl": 8.45703125, |
| "learning_rate": 7.612046748871327e-07, |
| "loss": 0.3634, |
| "reward": 1.421875, |
| "reward_std": 1.0414101481437683, |
| "rewards/accuracy_reward_staging": 0.015625, |
| "rewards/format_reward": 0.625, |
| "rewards/format_reward_staging": 0.640625, |
| "step": 355 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 208.21875, |
| "epoch": 178.0, |
| "grad_norm": 18.429391483064126, |
| "kl": 2.62109375, |
| "learning_rate": 7.281614543321269e-07, |
| "loss": 0.5711, |
| "reward": 7.0625, |
| "reward_std": 3.4084277749061584, |
| "rewards/accuracy_reward_staging": 0.578125, |
| "rewards/format_reward": 0.625, |
| "rewards/format_reward_staging": 0.65625, |
| "step": 356 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 252.484375, |
| "epoch": 178.5, |
| "grad_norm": 14.537737450397852, |
| "kl": 4.646484375, |
| "learning_rate": 6.958243201797554e-07, |
| "loss": 0.1274, |
| "reward": 5.28125, |
| "reward_std": 3.292633891105652, |
| "rewards/accuracy_reward_staging": 0.4375, |
| "rewards/format_reward": 0.453125, |
| "rewards/format_reward_staging": 0.453125, |
| "step": 357 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 92.875, |
| "epoch": 179.0, |
| "grad_norm": 37.718281676975764, |
| "kl": 7.54296875, |
| "learning_rate": 6.641957350279838e-07, |
| "loss": 0.2581, |
| "reward": 1.765625, |
| "reward_std": 0.5727441757917404, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.875, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 358 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 151.78125, |
| "epoch": 179.5, |
| "grad_norm": 6.103859851208992, |
| "kl": 2.55078125, |
| "learning_rate": 6.332781075160244e-07, |
| "loss": 0.1261, |
| "reward": 3.34375, |
| "reward_std": 3.3706541061401367, |
| "rewards/accuracy_reward_staging": 0.1875, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.75, |
| "step": 359 |
| }, |
| { |
| "epoch": 180.0, |
| "grad_norm": 16.044853615424703, |
| "learning_rate": 6.030737921409169e-07, |
| "loss": 0.4139, |
| "step": 360 |
| }, |
| { |
| "epoch": 180.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 120.9140625, |
| "eval_kl": 2.34375, |
| "eval_loss": 0.017587810754776, |
| "eval_reward": 3.7421875, |
| "eval_reward_std": 1.6082397252321243, |
| "eval_rewards/accuracy_reward_staging": 0.234375, |
| "eval_rewards/format_reward": 0.6796875, |
| "eval_rewards/format_reward_staging": 0.71875, |
| "eval_runtime": 19.8514, |
| "eval_samples_per_second": 0.403, |
| "eval_steps_per_second": 0.05, |
| "step": 360 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 156.7421875, |
| "epoch": 180.5, |
| "grad_norm": 75.37216219372046, |
| "kl": 4.494140625, |
| "learning_rate": 5.735850890782158e-07, |
| "loss": 0.1449, |
| "reward": 6.59375, |
| "reward_std": 3.172459162771702, |
| "rewards/accuracy_reward_staging": 0.4921875, |
| "rewards/format_reward": 0.828125, |
| "rewards/format_reward_staging": 0.84375, |
| "step": 361 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 247.46875, |
| "epoch": 181.0, |
| "grad_norm": 12.075035951064045, |
| "kl": 3.07421875, |
| "learning_rate": 5.448142440068316e-07, |
| "loss": 0.1604, |
| "reward": 1.09375, |
| "reward_std": 0.9126968681812286, |
| "rewards/accuracy_reward_staging": 0.015625, |
| "rewards/format_reward": 0.46875, |
| "rewards/format_reward_staging": 0.46875, |
| "step": 362 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 155.796875, |
| "epoch": 181.5, |
| "grad_norm": 64.11377087450238, |
| "kl": 3.0390625, |
| "learning_rate": 5.167634479380068e-07, |
| "loss": 0.8806, |
| "reward": 1.984375, |
| "reward_std": 1.6587499380111694, |
| "rewards/accuracy_reward_staging": 0.078125, |
| "rewards/format_reward": 0.59375, |
| "rewards/format_reward_staging": 0.609375, |
| "step": 363 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 175.96875, |
| "epoch": 182.0, |
| "grad_norm": 14.895018127249944, |
| "kl": 3.53125, |
| "learning_rate": 4.894348370484648e-07, |
| "loss": 0.2306, |
| "reward": 4.734375, |
| "reward_std": 3.2918047457933426, |
| "rewards/accuracy_reward_staging": 0.34375, |
| "rewards/format_reward": 0.65625, |
| "rewards/format_reward_staging": 0.640625, |
| "step": 364 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 99.234375, |
| "epoch": 182.5, |
| "grad_norm": 7.957954932541399, |
| "kl": 2.3203125, |
| "learning_rate": 4.628304925177318e-07, |
| "loss": 0.044, |
| "reward": 7.359375, |
| "reward_std": 3.293235272169113, |
| "rewards/accuracy_reward_staging": 0.546875, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.953125, |
| "step": 365 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 212.15625, |
| "epoch": 183.0, |
| "grad_norm": 13.244592420338774, |
| "kl": 4.23046875, |
| "learning_rate": 4.3695244036964567e-07, |
| "loss": 0.4, |
| "reward": 0.859375, |
| "reward_std": 0.5324036777019501, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.421875, |
| "rewards/format_reward_staging": 0.4375, |
| "step": 366 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 82.6875, |
| "epoch": 183.5, |
| "grad_norm": 23.479873436751383, |
| "kl": 13.568359375, |
| "learning_rate": 4.118026513180695e-07, |
| "loss": 0.286, |
| "reward": 6.6875, |
| "reward_std": 3.936906337738037, |
| "rewards/accuracy_reward_staging": 0.484375, |
| "rewards/format_reward": 0.9375, |
| "rewards/format_reward_staging": 0.90625, |
| "step": 367 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 220.9375, |
| "epoch": 184.0, |
| "grad_norm": 10.070112240993417, |
| "kl": 2.416015625, |
| "learning_rate": 3.8738304061681107e-07, |
| "loss": 0.2163, |
| "reward": 1.09375, |
| "reward_std": 0.8050735592842102, |
| "rewards/accuracy_reward_staging": 0.015625, |
| "rewards/format_reward": 0.46875, |
| "rewards/format_reward_staging": 0.46875, |
| "step": 368 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 88.8125, |
| "epoch": 184.5, |
| "grad_norm": 8.136056851141001, |
| "kl": 2.525390625, |
| "learning_rate": 3.6369546791377054e-07, |
| "loss": 0.1381, |
| "reward": 6.765625, |
| "reward_std": 4.866852879524231, |
| "rewards/accuracy_reward_staging": 0.53125, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.734375, |
| "step": 369 |
| }, |
| { |
| "epoch": 185.0, |
| "grad_norm": 17.037859451924284, |
| "learning_rate": 3.4074173710931804e-07, |
| "loss": 0.3005, |
| "step": 370 |
| }, |
| { |
| "epoch": 185.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 144.296875, |
| "eval_kl": 3.22265625, |
| "eval_loss": 0.20356737077236176, |
| "eval_reward": 3.8203125, |
| "eval_reward_std": 2.0116966366767883, |
| "eval_rewards/accuracy_reward_staging": 0.25, |
| "eval_rewards/format_reward": 0.6484375, |
| "eval_rewards/format_reward_staging": 0.671875, |
| "eval_runtime": 30.7935, |
| "eval_samples_per_second": 0.26, |
| "eval_steps_per_second": 0.032, |
| "step": 370 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 147.8671875, |
| "epoch": 185.5, |
| "grad_norm": 18.635491260401505, |
| "kl": 4.578125, |
| "learning_rate": 3.185235962189237e-07, |
| "loss": 0.3295, |
| "reward": 3.8046875, |
| "reward_std": 1.7106563821434975, |
| "rewards/accuracy_reward_staging": 0.2421875, |
| "rewards/format_reward": 0.6796875, |
| "rewards/format_reward_staging": 0.703125, |
| "step": 371 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 140.125, |
| "epoch": 186.0, |
| "grad_norm": 14.577568071606951, |
| "kl": 3.724609375, |
| "learning_rate": 2.970427372400353e-07, |
| "loss": 0.0855, |
| "reward": 1.4375, |
| "reward_std": 0.6442917585372925, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.703125, |
| "rewards/format_reward_staging": 0.734375, |
| "step": 372 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 184.890625, |
| "epoch": 186.5, |
| "grad_norm": 58.14646429098023, |
| "kl": 1.859375, |
| "learning_rate": 2.7630079602323447e-07, |
| "loss": 0.4805, |
| "reward": 7.09375, |
| "reward_std": 3.6945143938064575, |
| "rewards/accuracy_reward_staging": 0.5625, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.75, |
| "step": 373 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 97.53125, |
| "epoch": 187.0, |
| "grad_norm": 17.627898991545116, |
| "kl": 3.81640625, |
| "learning_rate": 2.5629935214764866e-07, |
| "loss": 0.5385, |
| "reward": 1.5625, |
| "reward_std": 0.9268264472484589, |
| "rewards/accuracy_reward_staging": 0.015625, |
| "rewards/format_reward": 0.6875, |
| "rewards/format_reward_staging": 0.71875, |
| "step": 374 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 183.8125, |
| "epoch": 187.5, |
| "grad_norm": 9.59043190634086, |
| "kl": 3.857421875, |
| "learning_rate": 2.370399288006664e-07, |
| "loss": 0.1804, |
| "reward": 2.703125, |
| "reward_std": 3.5622373819351196, |
| "rewards/accuracy_reward_staging": 0.171875, |
| "rewards/format_reward": 0.484375, |
| "rewards/format_reward_staging": 0.5, |
| "step": 375 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 113.0625, |
| "epoch": 188.0, |
| "grad_norm": 16.299906961139566, |
| "kl": 2.330078125, |
| "learning_rate": 2.1852399266194312e-07, |
| "loss": 0.4884, |
| "reward": 6.109375, |
| "reward_std": 1.688461884856224, |
| "rewards/accuracy_reward_staging": 0.4375, |
| "rewards/format_reward": 0.859375, |
| "rewards/format_reward_staging": 0.875, |
| "step": 376 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 182.75, |
| "epoch": 188.5, |
| "grad_norm": 8.539805865049278, |
| "kl": 8.970703125, |
| "learning_rate": 2.0075295379170413e-07, |
| "loss": 0.3236, |
| "reward": 1.71875, |
| "reward_std": 2.325968086719513, |
| "rewards/accuracy_reward_staging": 0.078125, |
| "rewards/format_reward": 0.46875, |
| "rewards/format_reward_staging": 0.46875, |
| "step": 377 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 73.34375, |
| "epoch": 189.0, |
| "grad_norm": 10.441791159370938, |
| "kl": 2.33203125, |
| "learning_rate": 1.8372816552336025e-07, |
| "loss": 0.1111, |
| "reward": 7.28125, |
| "reward_std": 3.7130661606788635, |
| "rewards/accuracy_reward_staging": 0.546875, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.921875, |
| "step": 378 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 140.890625, |
| "epoch": 189.5, |
| "grad_norm": 15.428094188720301, |
| "kl": 4.2265625, |
| "learning_rate": 1.6745092436045495e-07, |
| "loss": 0.0711, |
| "reward": 1.875, |
| "reward_std": 1.2960638105869293, |
| "rewards/accuracy_reward_staging": 0.046875, |
| "rewards/format_reward": 0.6875, |
| "rewards/format_reward_staging": 0.71875, |
| "step": 379 |
| }, |
| { |
| "epoch": 190.0, |
| "grad_norm": 7.7297837980318445, |
| "learning_rate": 1.519224698779198e-07, |
| "loss": 0.2035, |
| "step": 380 |
| }, |
| { |
| "epoch": 190.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 101.8515625, |
| "eval_kl": 16.283203125, |
| "eval_loss": 0.3184351921081543, |
| "eval_reward": 3.796875, |
| "eval_reward_std": 3.2891100347042084, |
| "eval_rewards/accuracy_reward_staging": 0.2421875, |
| "eval_rewards/format_reward": 0.6875, |
| "eval_rewards/format_reward_staging": 0.6875, |
| "eval_runtime": 28.2419, |
| "eval_samples_per_second": 0.283, |
| "eval_steps_per_second": 0.035, |
| "step": 380 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 93.7890625, |
| "epoch": 190.5, |
| "grad_norm": 6.692292880766585, |
| "kl": 3.47265625, |
| "learning_rate": 1.3714398462768563e-07, |
| "loss": 0.0341, |
| "reward": 3.6484375, |
| "reward_std": 2.5661590471863747, |
| "rewards/accuracy_reward_staging": 0.2265625, |
| "rewards/format_reward": 0.6953125, |
| "rewards/format_reward_staging": 0.6875, |
| "step": 381 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 126.25, |
| "epoch": 191.0, |
| "grad_norm": 12.116075689587404, |
| "kl": 6.255859375, |
| "learning_rate": 1.231165940486234e-07, |
| "loss": 0.0657, |
| "reward": 5.890625, |
| "reward_std": 2.3291388154029846, |
| "rewards/accuracy_reward_staging": 0.453125, |
| "rewards/format_reward": 0.671875, |
| "rewards/format_reward_staging": 0.6875, |
| "step": 382 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 180.640625, |
| "epoch": 191.5, |
| "grad_norm": 12.664375356889414, |
| "kl": 5.28515625, |
| "learning_rate": 1.0984136638083176e-07, |
| "loss": 0.3136, |
| "reward": 2.53125, |
| "reward_std": 3.584800824522972, |
| "rewards/accuracy_reward_staging": 0.15625, |
| "rewards/format_reward": 0.484375, |
| "rewards/format_reward_staging": 0.484375, |
| "step": 383 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 64.40625, |
| "epoch": 192.0, |
| "grad_norm": 4.339973094865359, |
| "kl": 1.96484375, |
| "learning_rate": 9.731931258429638e-08, |
| "loss": -0.0634, |
| "reward": 5.75, |
| "reward_std": 1.7209889590740204, |
| "rewards/accuracy_reward_staging": 0.390625, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.9375, |
| "step": 384 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 118.625, |
| "epoch": 192.5, |
| "grad_norm": 11.3243870187843, |
| "kl": 2.5078125, |
| "learning_rate": 8.555138626189619e-08, |
| "loss": 0.3026, |
| "reward": 5.828125, |
| "reward_std": 3.4805214405059814, |
| "rewards/accuracy_reward_staging": 0.4375, |
| "rewards/format_reward": 0.71875, |
| "rewards/format_reward_staging": 0.734375, |
| "step": 385 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 160.5, |
| "epoch": 193.0, |
| "grad_norm": 13.116416295027339, |
| "kl": 7.072265625, |
| "learning_rate": 7.453848358678018e-08, |
| "loss": 0.4106, |
| "reward": 1.515625, |
| "reward_std": 0.8902590423822403, |
| "rewards/accuracy_reward_staging": 0.015625, |
| "rewards/format_reward": 0.65625, |
| "rewards/format_reward_staging": 0.703125, |
| "step": 386 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 70.296875, |
| "epoch": 193.5, |
| "grad_norm": 25.501237737582994, |
| "kl": 4.64453125, |
| "learning_rate": 6.428144323412544e-08, |
| "loss": 0.0485, |
| "reward": 1.6875, |
| "reward_std": 0.6990881264209747, |
| "rewards/accuracy_reward_staging": 0.0, |
| "rewards/format_reward": 0.84375, |
| "rewards/format_reward_staging": 0.84375, |
| "step": 387 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 134.8125, |
| "epoch": 194.0, |
| "grad_norm": 11.249206018043907, |
| "kl": 4.14453125, |
| "learning_rate": 5.4781046317267103e-08, |
| "loss": 0.1459, |
| "reward": 6.109375, |
| "reward_std": 3.1836692690849304, |
| "rewards/accuracy_reward_staging": 0.515625, |
| "rewards/format_reward": 0.46875, |
| "rewards/format_reward_staging": 0.484375, |
| "step": 388 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 98.65625, |
| "epoch": 194.5, |
| "grad_norm": 29.24345766629281, |
| "kl": 15.025390625, |
| "learning_rate": 4.603801632821148e-08, |
| "loss": 0.1154, |
| "reward": 1.859375, |
| "reward_std": 2.3215357810258865, |
| "rewards/accuracy_reward_staging": 0.046875, |
| "rewards/format_reward": 0.703125, |
| "rewards/format_reward_staging": 0.6875, |
| "step": 389 |
| }, |
| { |
| "epoch": 195.0, |
| "grad_norm": 9.771374335396073, |
| "learning_rate": 3.805301908254455e-08, |
| "loss": 0.0831, |
| "step": 390 |
| }, |
| { |
| "epoch": 195.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 118.71875, |
| "eval_kl": 5.439453125, |
| "eval_loss": 0.20848847925662994, |
| "eval_reward": 3.8515625, |
| "eval_reward_std": 2.876259058713913, |
| "eval_rewards/accuracy_reward_staging": 0.25, |
| "eval_rewards/format_reward": 0.6640625, |
| "eval_rewards/format_reward_staging": 0.6875, |
| "eval_runtime": 28.0016, |
| "eval_samples_per_second": 0.286, |
| "eval_steps_per_second": 0.036, |
| "step": 390 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 131.8515625, |
| "epoch": 195.5, |
| "grad_norm": 18.170540683038464, |
| "kl": 4.927734375, |
| "learning_rate": 3.082666266872036e-08, |
| "loss": 0.3564, |
| "reward": 4.40625, |
| "reward_std": 2.274537533521652, |
| "rewards/accuracy_reward_staging": 0.3046875, |
| "rewards/format_reward": 0.671875, |
| "rewards/format_reward_staging": 0.6875, |
| "step": 391 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 89.9375, |
| "epoch": 196.0, |
| "grad_norm": 37.59210149762812, |
| "kl": 2.451171875, |
| "learning_rate": 2.4359497401758026e-08, |
| "loss": 0.3768, |
| "reward": 5.59375, |
| "reward_std": 2.1195763051509857, |
| "rewards/accuracy_reward_staging": 0.421875, |
| "rewards/format_reward": 0.671875, |
| "rewards/format_reward_staging": 0.703125, |
| "step": 392 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 249.640625, |
| "epoch": 196.5, |
| "grad_norm": 10.008461927031737, |
| "kl": 10.494140625, |
| "learning_rate": 1.86520157813308e-08, |
| "loss": 0.147, |
| "reward": 2.078125, |
| "reward_std": 1.580733835697174, |
| "rewards/accuracy_reward_staging": 0.109375, |
| "rewards/format_reward": 0.484375, |
| "rewards/format_reward_staging": 0.5, |
| "step": 393 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 70.09375, |
| "epoch": 197.0, |
| "grad_norm": 45.873680058130404, |
| "kl": 2.017578125, |
| "learning_rate": 1.370465245426167e-08, |
| "loss": 0.7853, |
| "reward": 5.8125, |
| "reward_std": 1.908312439918518, |
| "rewards/accuracy_reward_staging": 0.390625, |
| "rewards/format_reward": 0.953125, |
| "rewards/format_reward_staging": 0.953125, |
| "step": 394 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 75.359375, |
| "epoch": 197.5, |
| "grad_norm": 8.377325192503205, |
| "kl": 3.708984375, |
| "learning_rate": 9.517784181422018e-09, |
| "loss": 0.191, |
| "reward": 2.875, |
| "reward_std": 2.159477174282074, |
| "rewards/accuracy_reward_staging": 0.109375, |
| "rewards/format_reward": 0.890625, |
| "rewards/format_reward_staging": 0.890625, |
| "step": 395 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 283.65625, |
| "epoch": 198.0, |
| "grad_norm": 6.244882431212038, |
| "kl": 2.7890625, |
| "learning_rate": 6.091729809042379e-09, |
| "loss": 0.1084, |
| "reward": 5.40625, |
| "reward_std": 2.89695280790329, |
| "rewards/accuracy_reward_staging": 0.4375, |
| "rewards/format_reward": 0.515625, |
| "rewards/format_reward_staging": 0.515625, |
| "step": 396 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 106.40625, |
| "epoch": 198.5, |
| "grad_norm": 38.143536821255225, |
| "kl": 2.380859375, |
| "learning_rate": 3.4267502444274013e-09, |
| "loss": 0.6792, |
| "reward": 6.21875, |
| "reward_std": 3.516386479139328, |
| "rewards/accuracy_reward_staging": 0.4375, |
| "rewards/format_reward": 0.90625, |
| "rewards/format_reward_staging": 0.9375, |
| "step": 397 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 156.4375, |
| "epoch": 199.0, |
| "grad_norm": 14.86370731226592, |
| "kl": 3.87890625, |
| "learning_rate": 1.5230484360873043e-09, |
| "loss": 0.1544, |
| "reward": 1.0625, |
| "reward_std": 0.9951936304569244, |
| "rewards/accuracy_reward_staging": 0.015625, |
| "rewards/format_reward": 0.4375, |
| "rewards/format_reward_staging": 0.46875, |
| "step": 398 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 68.96875, |
| "epoch": 199.5, |
| "grad_norm": 3.6909839942424423, |
| "kl": 2.57421875, |
| "learning_rate": 3.807693582869032e-10, |
| "loss": -0.0079, |
| "reward": 3.234375, |
| "reward_std": 2.6320499926805496, |
| "rewards/accuracy_reward_staging": 0.140625, |
| "rewards/format_reward": 0.921875, |
| "rewards/format_reward_staging": 0.90625, |
| "step": 399 |
| }, |
| { |
| "epoch": 200.0, |
| "grad_norm": 29.889890508013348, |
| "learning_rate": 0.0, |
| "loss": 0.6276, |
| "step": 400 |
| }, |
| { |
| "epoch": 200.0, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 155.21875, |
| "eval_kl": 4.8115234375, |
| "eval_loss": 0.44944292306900024, |
| "eval_reward": 4.1171875, |
| "eval_reward_std": 2.682509124279022, |
| "eval_rewards/accuracy_reward_staging": 0.28125, |
| "eval_rewards/format_reward": 0.6328125, |
| "eval_rewards/format_reward_staging": 0.671875, |
| "eval_runtime": 36.2396, |
| "eval_samples_per_second": 0.221, |
| "eval_steps_per_second": 0.028, |
| "step": 400 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 306.3125, |
| "epoch": 200.0, |
| "kl": 5.7890625, |
| "reward": 3.8125, |
| "reward_std": 2.403294324874878, |
| "rewards/accuracy_reward_staging": 0.296875, |
| "rewards/format_reward": 0.421875, |
| "rewards/format_reward_staging": 0.421875, |
| "step": 400, |
| "total_flos": 0.0, |
| "train_loss": 0.7732266664505005, |
| "train_runtime": 7747.2937, |
| "train_samples_per_second": 0.207, |
| "train_steps_per_second": 0.052 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 400, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 200, |
| "save_steps": 40, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|